BibTeX bibliography sigarch.bib

%%% -*-BibTeX-*-
%%% ====================================================================
%%%  BibTeX-file{
%%%     author          = "Nelson H. F. Beebe",
%%%     version         = "1.57",
%%%     date            = "01 October 2024",
%%%     time            = "07:11:26 MDT",
%%%     filename        = "sigarch.bib",
%%%     address         = "University of Utah
%%%                        Department of Mathematics, 110 LCB
%%%                        155 S 1400 E RM 233
%%%                        Salt Lake City, UT 84112-0090
%%%                        USA",
%%%     telephone       = "+1 801 581 5254",
%%%     FAX             = "+1 801 581 4148",
%%%     URL             = "https://www.math.utah.edu/~beebe",
%%%     checksum        = "00518 99949 479329 4537365",
%%%     email           = "beebe at math.utah.edu, beebe at acm.org,
%%%                        beebe at computer.org (Internet)",
%%%     codetable       = "ISO/ASCII",
%%%     keywords        = "bibliography; BibTeX; Computer Architecture
%%%                        News; International Symposium on Computer
%%%                        Architecture (ISCA); SIGARCH",
%%%     license         = "public domain",
%%%     supported       = "yes",
%%%     docstring       = "This is an almost complete BibTeX
%%%                        bibliography for ACM SIGARCH Computer
%%%                        Architecture News (CODEN CANED2, ISSN
%%%                        0163-5964 (print), 1943-5851 (electronic)),
%%%                        which began publishing with volume 1, issue
%%%                        1, in January 1972.  The journal appears four
%%%                        to nine times a year, with five annual issues
%%%                        in recent years.  Publication ceased with
%%%                        volume 45, number 2, in May 2017.
%%%
%%%                        The incompleteness is due to holes in the ACM
%%%                        Portal Database: there are at least 8 issues
%%%                        for which no entry at all is present, or
%%%                        there is an issue Web page, but its contents
%%%                        are empty.  The missing issues are:
%%%
%%%                            Volume  1 number 1 1972
%%%                            Volume  1 number 3 1972
%%%                            Volume  2 number 2 1973
%%%                            Volume  5 number 3 1976
%%%                            Volume  5 number 5 1976
%%%                            Volume  8 number 1 1980
%%%                            Volume  9 number 3 1981
%%%                            Volume 36 number 6 2008
%%%
%%%                        The journal has World-Wide Web sites at
%%%
%%%                            https://dl.acm.org/newsletter/sigarch
%%%                            http://www.acm.org/sigarch/
%%%                            http://www.cs.wisc.edu/~arch/www/
%%%
%%%                        with tables of contents at
%%%
%%%                            https://dl.acm.org/loi/sigarch
%%%
%%%                        Some of the ISCA Conferences are jointly
%%%                        sponsored by the ACM and the IEEE, and also
%%%                        appear as an issue of Computer Architecture
%%%                        News. The first ISCA Conference was held in
%%%                        1973.  Tables of contents of the proceedings
%%%                        volumes, and pointers to online article text,
%%%                        may be available at
%%%
%%%                            http://portal.acm.org/browse_dl.cfm?idx=SERIES416
%%%                            http://ieeexplore.ieee.org/xpl/tocresult.jsp?isnumber=30879&isYear=YYYY
%%%
%%%                        where YYYY is a four-digit year.
%%%
%%%                        At version 1.57, the year coverage looked
%%%                        like this:
%%%
%%%                             1972 (   8)    1988 ( 103)    2004 (  77)
%%%                             1973 (  35)    1989 ( 116)    2005 ( 115)
%%%                             1974 (  44)    1990 ( 114)    2006 (  99)
%%%                             1975 (   9)    1991 ( 119)    2007 (  97)
%%%                             1976 (  53)    1992 (  87)    2008 (  90)
%%%                             1977 (  56)    1993 (  64)    2009 (  91)
%%%                             1978 (  33)    1994 (  70)    2010 ( 104)
%%%                             1979 (  16)    1995 (  59)    2011 ( 108)
%%%                             1980 (  67)    1996 (  51)    2012 ( 113)
%%%                             1981 (  18)    1997 (  51)    2013 ( 132)
%%%                             1982 (  81)    1998 ( 129)    2014 ( 120)
%%%                             1983 (  74)    1999 (  54)    2015 (  69)
%%%                             1984 (  60)    2000 (  69)    2016 ( 135)
%%%                             1985 (  67)    2001 (  56)    2017 ( 112)
%%%                             1986 (  66)    2002 (  69)
%%%                             1987 (  87)    2003 (  63)
%%%
%%%                             Article:       3395
%%%                             Book:             1
%%%                             InProceedings:   82
%%%                             Proceedings:     32
%%%
%%%                             Total entries: 3510
%%%
%%%                        This bibliography was constructed primarily
%%%                        from data in the ACM Portal database, and
%%%                        from several on-line library catalogs. The
%%%                        ACM Portal database lacks data for these
%%%                        volume(issue number) pairs: 1(1), 1(3), 2(2),
%%%                        5(3), 5(5), 8(1), 9(3), 36(6), 37(1), and
%%%                        41(1).
%%%
%%%                        Numerous errors in the sources noted above
%%%                        have been corrected.  Spelling has been
%%%                        verified with the UNIX spell and GNU ispell
%%%                        programs using the exception dictionary
%%%                        stored in the companion file with extension
%%%                        .sok.
%%%
%%%                        BibTeX citation tags are uniformly chosen as
%%%                        name:year:abbrev, where name is the family
%%%                        name of the first author or editor, year is a
%%%                        4-digit number, and abbrev is a 3-letter
%%%                        condensation of important title words.
%%%                        Citation labels were automatically generated
%%%                        by software developed for the BibNet Project.
%%%
%%%                        In this bibliography, entries are sorted in
%%%                        publication order, with the help of
%%%                        ``bibsort -byvolume''.  The bibsort utility
%%%                        is available from
%%%
%%%                            https://www.math.utah.edu/pub/bibsort
%%%                            ftp://ftp.math.utah.edu/pub/bibsort
%%%
%%%                        The checksum field above contains a CRC-16
%%%                        checksum as the first value, followed by the
%%%                        equivalent of the standard UNIX wc (word
%%%                        count) utility output of lines, words, and
%%%                        characters.  This is produced by Robert
%%%                        Solovay's checksum utility.",
%%%  }
%%% ====================================================================

%%% LaTeX code emitted ahead of the bibliography.  The quoted fragments
%%% below are joined with the BibTeX `#' concatenation operator.  Each
%%% \ifx \undefined test supplies a fallback definition only when the
%%% macro is not already defined: \circled wraps its argument in
%%% parentheses, \reg expands to \circled{R}, and \TM typesets a
%%% superscript small-caps TM.  The \hyphenation list is empty.
@Preamble{
    "\hyphenation{ }" #
    "\ifx \undefined \circled \def \circled #1{(#1)} \fi" #
    "\ifx \undefined \reg     \def \reg       {\circled{R}}\fi" #
    "\ifx \undefined \TM      \def \TM        {${}^{\sc TM}$} \fi"
}

%%% ====================================================================
%%% Acknowledgement abbreviations:

%%% ack-nhfb: postal, telephone, FAX, e-mail, and Web contact details of
%%% the file author; the entries below reference this macro from their
%%% acknowledgement field.
@String{ack-nhfb = "Nelson H. F. Beebe,
                    University of Utah,
                    Department of Mathematics, 110 LCB,
                    155 S 1400 E RM 233,
                    Salt Lake City, UT 84112-0090, USA,
                    Tel: +1 801 581 5254,
                    FAX: +1 801 581 4148,
                    e-mail: \path|beebe@math.utah.edu|,
                            \path|beebe@acm.org|,
                            \path|beebe@computer.org| (Internet),
                    URL: \path|https://www.math.utah.edu/~beebe/|"}

%%% ====================================================================
%%% Journal abbreviations:

@String{j-COMP-ARCH-NEWS        = {ACM SIGARCH Computer Architecture News}}

%%% ====================================================================
%%% Publishers and their addresses:

@String{pub-ACM     = {ACM Press}}
@String{pub-ACM:adr = {New York, NY 10036, USA}}

@String{pub-IEEE     = {IEEE Computer Society Press}}
@String{pub-IEEE:adr = {1109 Spring Street, Suite 300,
                        Silver Spring, MD 20910, USA}}

@String{pub-MORGAN-KAUFMANN       = {Morgan Kaufmann Publishers}}
@String{pub-MORGAN-KAUFMANN:adrsf = {San Francisco, CA, USA}}

%%% ====================================================================
%%% Bibliography entries, in publication order:

%%% TO DO: [04-Sep-2014] Volume 1 number 1: no data yet in ACM Portal database

@Article{Foster:1972:RDM,
  author =       "Caxton C. Foster",
  title =        "A review of dynamic memories with enhanced data access
                 by {Harold S. Stone. IEEETC Vol. C-21, \#4, p 359--386,
                 April 1972}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "1",
  number =       "2",
  pages =        "3--7",
  month =        apr,
  year =         "1972",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:38 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bataille:1972:SOG,
  author          = {M. Bataille},
  title           = {Something old: the {Gamma 60} the computer that was
                     ahead of its time},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {1},
  number          = {2},
  pages           = {10--15},
  month           = apr,
  year            = {1972},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:38 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Foster:1972:SNI,
  author          = {Caxton C. Foster},
  title           = {Something new: the {Intel MCS-4} micro computer set},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {1},
  number          = {2},
  pages           = {16--17},
  month           = apr,
  year            = {1972},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:38 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Lee:1972:MNC,
  author          = {J. A. N. Lee},
  title           = {My next compiler},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {1},
  number          = {2},
  pages           = {17--19},
  month           = apr,
  year            = {1972},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:38 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Flynn:1972:CAJ,
  author =       "Michael J. Flynn and Carol Rogers",
  title =        "Computer architecture at {Johns Hopkins}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "1",
  number =       "2",
  pages =        "21--33",
  month =        apr,
  year =         "1972",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:38 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% TO DO: [04-Sep-2014] Volume 1 number 3: no data yet in ACM Portal database

@Article{Vaughan:1972:CAS,
  author          = {R. F. Vaughan and R. A. Collins},
  title           = {On computer architecture, software portability \&
                     microprogramming},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {1},
  number          = {4},
  pages           = {14--15},
  month           = oct,
  year            = {1972},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:08 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Brakefield:1972:OFP,
  author          = {James C. Brakefield},
  title           = {An optimal floating point format},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {1},
  number          = {4},
  pages           = {16--17},
  month           = oct,
  year            = {1972},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:08 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Brewer:1972:RDD,
  author          = {J. E. Brewer},
  title           = {Recent doctoral dissertations of interest to
                     {SIGARCH}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {1},
  number          = {4},
  pages           = {18--20},
  month           = oct,
  year            = {1972},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:08 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Bettcher:1973:TSR,
  author          = {C. W. Bettcher},
  title           = {Thread standardization and relative cost},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {2},
  number          = {1},
  pages           = {9--9},
  month           = jan,
  year            = {1973},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:28 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  remark          = {This is a reprint of an article published in the {\em
                     Journal of the Society of Automotive Engineers}, Volume
                     XVIII, Number 2, p. 131, February 1926, about the cost
                     of the lack of standardization of screw threads. {\em
                     Computer Architecture News\/} Editor-in-Chief Caxton C.
                     Foster has added a hand-written note ``of course, there
                     is no message here for {\em us}.''},
}

@Article{Sites:1973:FPS,
  author          = {Richard L. Sites},
  title           = {Floating point significance interrupt proposal},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {2},
  number          = {1},
  pages           = {10--12},
  month           = jan,
  year            = {1973},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:28 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {The purpose of this proposal is to aid numerical
                     analysts in observing the significance of results in
                     floating-point calculations. This proposal is not a
                     cure-all, but it does attempt to a first, high-payoff
                     step in understanding and analyzing floating-point
                     algorithms. This proposal is specifically for IBM
                     360/370 architecture, but the ideas are applicable to
                     all machines.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  remark          = {The author observes that register clearing by
                     subtraction is common, and is one of the reasons that
                     ``all IBM language processors execute with significance
                     masked off.'' He proposes suppressing the significance
                     interrupt in subtractions when both operands are
                     equal.},
}

@Article{Foster:1973:CA,
  author          = {Caxton Foster},
  title           = {Computer architecture},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {2},
  number          = {1},
  pages           = {13--18},
  month           = jan,
  year            = {1973},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:28 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

%%% TO DO: [04-Sep-2014] Volume 2 number 2: no data yet in ACM Portal database

@Article{Adler:1973:MCC,
  author          = {Louis S. Adler},
  title           = {A mini-computer configuration for {CAI}: a systems
                     engineering view},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {2},
  number          = {3},
  pages           = {10--19},
  month           = oct,
  year            = {1973},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/1216456.1216457},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Tue Jul 6 14:31:17 MDT 2010},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Computer assisted instruction (CAI) has not impacted
                     the educational world with the degree of success which
                     early proponents predicted. Although CAI has proven to
                     be a more efficient learning tool than common
                     traditional methods in specific instances, the overall
                     success of such systems has been sporadic. There is no
                     question that a well-designed and correctly implemented
                     CAI system can be highly effective; however, several
                     important factors must be overcome to guarantee a
                     reasonable amount of success. These are:\par

                     * Overcoming the present high cost of hardware while
                     still providing a reliable system having acceptable
                     display capability.\par

                     * Developing a software real-time operating system
                     which guarantees fast response times.\par

                     * Authoring high quality courseware.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Gentleman:1973:TC,
  author          = {W. M. Gentleman and B. A. Wichmann},
  title           = {Timing on computers},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {2},
  number          = {3},
  pages           = {20--23},
  month           = oct,
  year            = {1973},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/1216456.1216458},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Tue Jul 6 14:31:17 MDT 2010},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  URL             = {ftp://ftp.math.utah.edu/pub/mirrors/ftp.ira.uka.de/bibliography/Misc/monitor.bib},
  abstract        = {Most computers today provide some form of clock which
                     can be read by software. The purpose of this note is to
                     illustrate why in many existing systems, the facilities
                     offered are inadequate for ordinary programmers.
                     Proposals are made for changes in both hardware and
                     software to remedy these deficiencies.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Schank:1973:AAS,
  author          = {Karl Schank},
  title           = {Architectural assistance to software debugging aids},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {2},
  number          = {3},
  pages           = {37--38},
  month           = oct,
  year            = {1973},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/1216456.1216459},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Tue Jul 6 14:31:17 MDT 2010},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {It has been observed [1] that 45 to 50\% of
                     programming effort is spent in debugging, checkout and
                     testing, yet the architecture of most modern computer
                     systems does little if anything to facilitate ease of
                     debugging. In most batch systems the programmer is
                     sufficiently removed from the execution of his program
                     as to be severely handicapped in diagnosing errors.
                     There is only so much information that can be easily
                     obtained from a voluminous core dump, for instance.
                     Even programmers on large timesharing systems have
                     available at most an interactive software debugging
                     package which operates through a combination of
                     insertions and replacements of object code and
                     interpretation (rather than execution) of machine code.
                     This can get to be quite inefficient when carried to
                     the extreme and often is useful only if the program has
                     been processed by a special compiler.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Bhandarkar:1973:MCM,
  author          = {Dileep P. Bhandarkar and Samuel H. Fuller},
  title           = {{Markov} chain models for analyzing memory
                     interference in multiprocessor computer systems},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {2},
  number          = {4},
  pages           = {1--6},
  month           = dec,
  year            = {1973},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Anderson:1973:IDP,
  author          = {George A. Anderson},
  title           = {Interconnecting a distributed processor system for
                     avionics},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {2},
  number          = {4},
  pages           = {11--16},
  month           = dec,
  year            = {1973},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Goke:1973:BNP,
  author          = {L. Rodney Goke and G. J. Lipovski},
  title           = {{Banyan} networks for partitioning multiprocessor
                     systems},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {2},
  number          = {4},
  pages           = {21--28},
  month           = dec,
  year            = {1973},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Jordan:1973:SDS,
  author          = {Harry F. Jordan and Burton J. Smith},
  title           = {Structure of digital system description languages},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {2},
  number          = {4},
  pages           = {31--34},
  month           = dec,
  year            = {1973},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Lee:1973:VDS,
  author          = {John A. N. Lee},
  title           = {{VDL}---a definition system for all levels},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {2},
  number          = {4},
  pages           = {41--48},
  month           = dec,
  year            = {1973},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Radoy:1973:MPP,
  author          = {Charles H. Radoy and George P. {Copeland, Jr.} and G.
                     J. Lipovski},
  title           = {A methodology for parallel processing design
                     tradeoffs},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {2},
  number          = {4},
  pages           = {51--56},
  month           = dec,
  year            = {1973},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Reddaway:1973:DDA,
  author          = {S. F. Reddaway},
  title           = {{DAP}---a distributed array processor},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {2},
  number          = {4},
  pages           = {61--65},
  month           = dec,
  year            = {1973},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Kogge:1973:MRP,
  author          = {Peter M. Kogge},
  title           = {Maximal rate pipelined solutions to recurrence
                     problems},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {2},
  number          = {4},
  pages           = {71--76},
  month           = dec,
  year            = {1973},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Agerwala:1973:CCL,
  author          = {Tilak Agerwala and Mike Flynn},
  title           = {Comments on capabilities, limitations and
                     ``correctness'' of {Petri} nets},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {2},
  number          = {4},
  pages           = {81--86},
  month           = dec,
  year            = {1973},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Omohundro:1973:FFC,
  author          = {Wayne E. Omohundro and James H. Tracey},
  title           = {{Flowware}---a flow charting procedure to describe
                     digital networks},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {2},
  number          = {4},
  pages           = {91--97},
  month           = dec,
  year            = {1973},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Barbacci:1973:AED,
  author          = {Mario R. Barbacci and Daniel P. Siewiorek},
  title           = {Automated exploration of the design space for register
                     transfer {(RT)} systems},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {2},
  number          = {4},
  pages           = {101--106},
  month           = dec,
  year            = {1973},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Laliotis:1973:IAS,
  author          = {T. A. Laliotis},
  title           = {Implementation aspects of the symbol hardware
                     compiler},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {2},
  number          = {4},
  pages           = {111--115},
  month           = dec,
  year            = {1973},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Copeland:1973:ACC,
  author          = {George P. {Copeland, Jr.} and G. J. Lipovski and
                     Stanley Y. W. Su},
  title           = {The architecture of {CASSM}: a cellular system for
                     non-numeric processing},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {2},
  number          = {4},
  pages           = {121--128},
  month           = dec,
  year            = {1973},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Hemphill:1973:DDG,
  author          = {John M. Hemphill and S. A. Szygenda},
  title           = {Deriving design guidelines for diagnosable computer
                     systems},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {2},
  number          = {4},
  pages           = {131--135},
  month           = dec,
  year            = {1973},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Parhami:1973:DFT,
  author          = {Behrooz Parhami and Algirdas Avizienis},
  title           = {Design of fault-tolerant associative processors},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {2},
  number          = {4},
  pages           = {141--145},
  month           = dec,
  year            = {1973},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Fischler:1973:FTM,
  author          = {M. A. Fischler and O. Firschein},
  title           = {A fault tolerant multiprocessor architecture for
                     real-time control applications},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {2},
  number          = {4},
  pages           = {151--157},
  month           = dec,
  year            = {1973},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Lipovski:1973:VFS,
  author          = {G. J. Lipovski},
  title           = {A varistructured fail-soft cellular computer},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {2},
  number          = {4},
  pages           = {161--165},
  month           = dec,
  year            = {1973},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Vaucher:1973:HLC,
  author          = {Jean Vaucher and Christian Rey},
  title           = {A hardware laboratory for computer architecture
                     research},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {2},
  number          = {4},
  pages           = {171--175},
  month           = dec,
  year            = {1973},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Knoke:1973:SEC,
  author          = {P. J. Knoke},
  title           = {Simulation exercises for computer architecture
                     education},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {2},
  number          = {4},
  pages           = {181--185},
  month           = dec,
  year            = {1973},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Sloan:1973:CAC,
  author          = {M. E. Sloan},
  title           = {Computer architecture courses in electrical
                     engineering departments},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {2},
  number          = {4},
  pages           = {191--195},
  month           = dec,
  year            = {1973},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Hartenstein:1973:IHC,
  author          = {R. Hartenstein},
  title           = {Increasing hardware complexity---a challenge to
                     computer architecture education},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {2},
  number          = {4},
  pages           = {201--206},
  month           = dec,
  year            = {1973},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Rossmann:1973:RWC,
  author          = {George Rossmann},
  title           = {Review of the {{\em Workshop on Computer Architecture
                     Education}}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {2},
  number          = {4},
  pages           = {211--214},
  month           = dec,
  year            = {1973},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Cooper:1973:MMB,
  author          = {Richard G. Cooper},
  title           = {{Micromodules}: Microprogrammable building blocks for
                     hardware development},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {2},
  number          = {4},
  pages           = {221--226},
  month           = dec,
  year            = {1973},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Fuller:1973:CMA,
  author =       "S. H. Fuller and D. P. Siewiorek and R. J. Swan",
  title =        "{Computer Modules}: an architecture for large digital
                 modules",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "4",
  pages =        "231--237",
  month =        dec,
  year =         "1973",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Zaks:1973:MAF,
  author          = {Rodnay Zaks},
  title           = {A microprogrammed architecture for front end
                     processing},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {2},
  number          = {4},
  pages           = {241--246},
  month           = dec,
  year            = {1973},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Vranesic:1973:DFV,
  author          = {Z. G. Vranesic and V. C. Hamacher and Y. Y. Leung},
  title           = {Design of a fully variable-length structured
                     minicomputer},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {2},
  number          = {4},
  pages           = {251--255},
  month           = dec,
  year            = {1973},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Marvel:1973:HHA,
  author =       "Orin E. Marvel",
  title =        "{HAPPE} {Honeywell Associative Parallel Processing
                 Ensemble}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "4",
  pages =        "261--267",
  month =        dec,
  year =         "1973",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Schaffner:1973:CAP,
  author          = {Mario R. Schaffner},
  title           = {A computer architecture and its programming language},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {2},
  number          = {4},
  pages           = {271--277},
  month           = dec,
  year            = {1973},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Shore:1974:CCa,
  author          = {John Shore},
  title           = {Conjecture corner},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {3},
  number          = {1},
  pages           = {3--6},
  month           = mar,
  year            = {1974},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:29 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{McKeeman:1974:CDE,
  author          = {W. M. McKeeman},
  title           = {Computer design evaluation using programming language
                     primitives},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {3},
  number          = {1},
  pages           = {7--18},
  month           = mar,
  year            = {1974},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:29 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Hartenstein:1974:LMI,
  author          = {Reiner W. Hartenstein},
  title           = {Letter to membership from incoming chairman {(CAN,
                     Oct. 73)}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {3},
  number          = {1},
  pages           = {19--22},
  month           = mar,
  year            = {1974},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:29 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Stryker:1974:SSA,
  author          = {David Stryker and David Weiss},
  title           = {Secure system architecture},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {3},
  number          = {2},
  pages           = {37--38},
  month           = jun,
  year            = {1974},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:41 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

%%% NOTE(review): the co-author surname `Morning' in the title below may be
%%% a misspelling of `Moring'; confirm against the printed issue before
%%% changing it.
@Article{Su:1974:BRL,
  author          = {Stephen Y. H. Su},
  title           = {Book review of {{\em Logic and Logic Design\/}} by {B.
                     Girling and H. G. Morning. International Textbook
                     Company Limited 1973}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {3},
  number          = {3},
  pages           = {2--3},
  month           = sep,
  year            = {1974},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:02 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Shore:1974:CCb,
  author          = {John Shore},
  title           = {Conjecture corner},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {3},
  number          = {3},
  pages           = {4--9},
  month           = sep,
  year            = {1974},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:02 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Nisnevich:1974:DPC,
  author          = {L. Nisnevich and E. Strasbourger},
  title           = {Decentralized priority control in data communication},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {3},
  number          = {4},
  pages           = {1--6},
  month           = dec,
  year            = {1974},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Reames:1974:LNS,
  author          = {Cecil C. Reames and Ming T. Liu},
  title           = {A loop network for simultaneous transmission of
                     variable-length messages},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {3},
  number          = {4},
  pages           = {7--12},
  month           = dec,
  year            = {1974},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Callan:1974:APS,
  author          = {James F. Callan},
  title           = {The architecture of the {Picture System}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {3},
  number          = {4},
  pages           = {13--16},
  month           = dec,
  year            = {1974},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  keywords        = {Evans \& Sutherland Picture System},
}

@Article{Staudhammer:1974:FDO,
  author          = {John Staudhammer and Jeffrey F. Eastman and James N.
                     England},
  title           = {A fast display-oriented processor},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {3},
  number          = {4},
  pages           = {17--22},
  month           = dec,
  year            = {1974},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Eastman:1974:CDC,
  author          = {Jeffrey F. Eastman and John Staudhammer},
  title           = {Computer display of colored three-dimensional
                     objects},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {3},
  number          = {4},
  pages           = {23--27},
  month           = dec,
  year            = {1974},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Kerr:1974:MPI,
  author          = {Henry D. Kerr},
  title           = {A microprogrammed processor for interactive computer
                     graphics},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {3},
  number          = {4},
  pages           = {28--33},
  month           = dec,
  year            = {1974},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Armstrong:1974:FMT,
  author          = {C. V. W. Armstrong},
  title           = {Functional memory techniques applied to the
                     microprogrammed control of an associative processor},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {3},
  number          = {4},
  pages           = {34--40},
  month           = dec,
  year            = {1974},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Wade:1974:IDM,
  author          = {James F. Wade and Paul D. Stigall},
  title           = {Instruction design to minimize program size},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {3},
  number          = {4},
  pages           = {41--44},
  month           = dec,
  year            = {1974},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Bondi:1974:HHM,
  author          = {James O. Bondi and Paul D. Stigall},
  title           = {{HMO}, a hardware microcode optimizer},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {3},
  number          = {4},
  pages           = {45--51},
  month           = dec,
  year            = {1974},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Peskin:1974:CAD,
  author          = {A. M. Peskin},
  title           = {The computer aided design of processor architectures},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {3},
  number          = {4},
  pages           = {51--55},
  month           = dec,
  year            = {1974},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Huen:1974:IPR,
  author          = {W. H. Huen and D. P. Siewiorek},
  title           = {Intermodule protocol for register transfer level
                     modules: representation and analytic tools},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {3},
  number          = {4},
  pages           = {56--62},
  month           = dec,
  year            = {1974},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Isaacson:1974:PSP,
  author          = {Portia Isaacson},
  title           = {Picture systems, {PS}, and the design of a
                     channel-to-channel computer interface},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {3},
  number          = {4},
  pages           = {63--70},
  month           = dec,
  year            = {1974},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Lofgren:1974:RCT,
  author          = {Lennart L{\"o}fgren},
  title           = {Reference concepts in a tree structured address
                     space},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {3},
  number          = {4},
  pages           = {71--79},
  month           = dec,
  year            = {1974},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Anderson:1974:VMM,
  author          = {Judith A. Anderson and G. J. Lipovski},
  title           = {A virtual memory for microprocessors},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {3},
  number          = {4},
  pages           = {80--84},
  month           = dec,
  year            = {1974},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Brundage:1974:PED,
  author          = {R. E. Brundage and A. P. Batson},
  title           = {The performance enhancement of descriptor-based
                     virtual memory systems through the use of associative
                     registers},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {3},
  number          = {4},
  pages           = {85--90},
  month           = dec,
  year            = {1974},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Marvel:1974:SSP,
  author          = {Orin E. Marvel},
  title           = {{SPEAC}: special purpose electronic area correlator},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {3},
  number          = {4},
  pages           = {91--94},
  month           = dec,
  year            = {1974},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Satterfield:1974:AAS,
  author =       "James M. Satterfield",
  title =        "Architectural advances of the {Space Shuttle} orbiter
                 avionics computer system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "3",
  number =       "4",
  pages =        "95--98",
  month =        dec,
  year =         "1974",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kodres:1974:DSA,
  author          = {Uno R. Kodres and William L. McCracken},
  title           = {Design study of an avionics navigation microcomputer},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {3},
  number          = {4},
  pages           = {99--105},
  month           = dec,
  year            = {1974},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Kane:1974:ISI,
  author          = {Gerald R. Kane},
  title           = {An iteratively structured information processor},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {3},
  number          = {4},
  pages           = {106--112},
  month           = dec,
  year            = {1974},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Richards:1974:HSI,
  author          = {H. {Richards, Jr.} and A. E. Oldehoeft},
  title           = {Hardware-software interactions in {SYMBOL-2R}'s
                     operating system},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {3},
  number          = {4},
  pages           = {113--118},
  month           = dec,
  year            = {1974},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Sylvain:1974:DEA,
  author          = {Pierre Sylvain and Maniel Vineberg},
  title           = {The design and evaluation of the array machine: a
                     high-level language processor},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {3},
  number          = {4},
  pages           = {119--125},
  month           = dec,
  year            = {1974},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Dennis:1974:PAB,
  author          = {Jack B. Dennis and David P. Misunas},
  title           = {A preliminary architecture for a basic data-flow
                     processor},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {3},
  number          = {4},
  pages           = {126--132},
  month           = dec,
  year            = {1974},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Berkling:1974:RLR,
  author          = {K. J. Berkling},
  title           = {Reduction languages for reduction machines},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {3},
  number          = {4},
  pages           = {133--140},
  month           = dec,
  year            = {1974},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{King:1974:ODS,
  author          = {Willis K. King and Fulvio Carbonaro},
  title           = {Output devices sharing by minicomputers},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {3},
  number          = {4},
  pages           = {141--145},
  month           = dec,
  year            = {1974},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Rannem:1974:RSC,
  author          = {S. Rannem and V. C. Hamacher and S. G. Zaky and P.
                     Connolly},
  title           = {On relating small computer performance to design
                     parameters},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {3},
  number          = {4},
  pages           = {146--151},
  month           = dec,
  year            = {1974},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:04 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Lawson:1974:ASH,
  author =       "Harold W. {Lawson, Jr.} and Bengt Magnhagen",
  title =        "Advantages of structured hardware",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "3",
  number =       "4",
  pages =        "152--158",
  month =        dec,
  year =         "1974",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kornerup:1974:CMS,
  author =       "Peter Kornerup",
  title =        "Concepts of the {MATHILDA} system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "3",
  number =       "4",
  pages =        "159--164",
  month =        dec,
  year =         "1974",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Foster:1974:S,
  author =       "Caxton C. Foster",
  title =        "{SOCRATES}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "3",
  number =       "4",
  pages =        "165--169",
  month =        dec,
  year =         "1974",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wann:1974:CCS,
  author =       "Donald F. Wann and Robert A. Ellis",
  title =        "Conjoined computer systems: an architecture for
                 laboratory data processing and instrument control",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "3",
  number =       "4",
  pages =        "170--175",
  month =        dec,
  year =         "1974",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Jensen:1974:DFC,
  author =       "E. Douglas Jensen",
  title =        "A distributed function computer for real-time
                 control",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "3",
  number =       "4",
  pages =        "176--182",
  month =        dec,
  year =         "1974",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Radoy:1974:SMI,
  author =       "C. H. Radoy and G. J. Lipovski",
  title =        "Switched multiple instruction, multiple data stream
                 processing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "3",
  number =       "4",
  pages =        "183--187",
  month =        dec,
  year =         "1974",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lechner:1974:SED,
  author =       "Robert J. Lechner",
  title =        "Sequentially encoded data structures that support
                 bidirectional scanning",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "3",
  number =       "4",
  pages =        "188--194",
  month =        dec,
  year =         "1974",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Freeman:1974:ICE,
  author =       "Martin Freeman",
  title =        "An instruction class for an extensible interpreter",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "3",
  number =       "4",
  pages =        "195--200",
  month =        dec,
  year =         "1974",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Giloi:1974:SCC,
  author =       "W. K. Giloi and H. Berg",
  title =        "{STARLET}: a computer concept based on ordered sets as
                 primitive data types",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "3",
  number =       "4",
  pages =        "201--206",
  month =        dec,
  year =         "1974",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Cornell:1974:CGP,
  author =       "R. G. Cornell and H. C. Torng",
  title =        "A cellular general purpose computer",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "3",
  number =       "4",
  pages =        "207--213",
  month =        dec,
  year =         "1974",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Goldstein:1974:MOR,
  author =       "Barry C. Goldstein and Thomas W. Scrutchin",
  title =        "A machine-oriented resource management architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "3",
  number =       "4",
  pages =        "214--219",
  month =        dec,
  year =         "1974",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sloan:1974:DOC,
  author =       "M. E. Sloan",
  title =        "A design-oriented computer engineering program",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "3",
  number =       "4",
  pages =        "220--224",
  month =        dec,
  year =         "1974",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Baron:1974:ELC,
  author =       "Janis Beitch Baron and D. E. Atkins",
  title =        "An educational laboratory in contemporary digital
                 design",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "3",
  number =       "4",
  pages =        "225--231",
  month =        dec,
  year =         "1974",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Smith:1975:ACFa,
  author =       "W. R. Smith",
  title =        "{AADC} computer family architecture program",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "4",
  number =       "1",
  pages =        "4--8",
  month =        mar,
  year =         "1975",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lunde:1975:MDW,
  author =       "{\AA}mund Lunde",
  title =        "More data on the {O/W} ratios: a note on a paper by
                 {Flynn}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "4",
  number =       "1",
  pages =        "9--13",
  month =        mar,
  year =         "1975",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lipovski:1975:NNA,
  author =       "G. Jack Lipovski and Stanley Y. W. Su",
  title =        "On non-numeric architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "4",
  number =       "1",
  pages =        "14--29",
  month =        mar,
  year =         "1975",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Boulaye:1975:SDS,
  author =       "Guy G. Boulaye",
  title =        "Structured design for structured computer
                 architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "4",
  number =       "2",
  pages =        "8--17",
  month =        jun,
  year =         "1975",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:42 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Parnas:1975:ECA,
  author =       "D. L. Parnas",
  title =        "Evaluation criteria for abstract machines with unknown
                 applications",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "4",
  number =       "3",
  pages =        "2--9",
  month =        sep,
  year =         "1975",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:02 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "Special issue: The AADC computer family architecture
                 project",
}

@Article{Smith:1975:ACFb,
  author =       "William R. Smith",
  title =        "{AADC} computer family architecture questions and
                 answers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "4",
  number =       "3",
  pages =        "15--21",
  month =        sep,
  year =         "1975",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:02 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "Special issue: The AADC computer family architecture
                 project",
}

@Article{Su:1975:ICC,
  author =       "Stephen Y. H. Su",
  title =        "An introduction to {CHDL} (computer hardware
                 description languages)",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "4",
  number =       "3",
  pages =        "22--23",
  month =        sep,
  year =         "1975",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:02 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Doran:1975:ICL,
  author =       "R. W. Doran",
  title =        "The {International Computers Ltd. ICL2900} computer
                 architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "4",
  number =       "3",
  pages =        "24--47",
  month =        sep,
  year =         "1975",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:02 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bell:1976:CSW,
  author =       "Gordon Bell and William D. Strecker",
  title =        "Computer structures: {What} have we learned from the
                 {PDP-11}?",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "4",
  number =       "4",
  pages =        "1--14",
  month =        jan,
  year =         "1976",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:06 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kerner:1976:PLL,
  author =       "Helmut Kerner and Werner Beyerle",
  title =        "A {PMS} level language for performance evaluation
                 modelling {(V-PMS)}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "4",
  number =       "4",
  pages =        "15--19",
  month =        jan,
  year =         "1976",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:06 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Moalla:1976:DTM,
  author =       "M. Moalla and G. Saucier and J. Sifakis and M.
                 Zachariades",
  title =        "A design tool for the multilevel description and
                 simulation of systems of interconnected modules",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "4",
  number =       "4",
  pages =        "20--27",
  month =        jan,
  year =         "1976",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:06 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Allen:1976:CCS,
  author =       "Jonathan Allen",
  title =        "A course in computer structures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "4",
  number =       "4",
  pages =        "28--32",
  month =        jan,
  year =         "1976",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:06 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Rossmann:1976:ICS,
  author =       "George E. Rossmann",
  title =        "The {IEEE Computer Society} task force on computer
                 architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "4",
  number =       "4",
  pages =        "33--33",
  month =        jan,
  year =         "1976",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:06 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Widdoes:1976:MMM,
  author =       "Lawrence C. {Widdoes, Jr.}",
  title =        "The {Minerva} multi-microprocessor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "4",
  number =       "4",
  pages =        "34--39",
  month =        jan,
  year =         "1976",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:06 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Arnold:1976:HRM,
  author =       "R. G. Arnold and E. W. Page",
  title =        "A hierarchical, restructurable multi-microprocessor
                 architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "4",
  number =       "4",
  pages =        "40--45",
  month =        jan,
  year =         "1976",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:06 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{McGill:1976:MAN,
  author =       "Robert McGill and John Steinhoff",
  title =        "A multimicroprocessor approach to numerical analysis:
                 {An} application to gaming problems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "4",
  number =       "4",
  pages =        "46--51",
  month =        jan,
  year =         "1976",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:06 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Jensen:1976:MIS,
  author =       "John E. Jensen and Jean-Loup Baer",
  title =        "A model of interference in a shared resource
                 multiprocessor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "4",
  number =       "4",
  pages =        "52--57",
  month =        jan,
  year =         "1976",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:06 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Leung:1976:CSF,
  author =       "Clement K. C. Leung and David P. Misunas and Andrij
                 Neczwid and Jack B. Dennis",
  title =        "A computer simulation facility for packet
                 communication architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "4",
  number =       "4",
  pages =        "58--63",
  month =        jan,
  year =         "1976",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:06 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Rege:1976:CPS,
  author =       "S. L. Rege",
  title =        "Cost, performance and size tradeoffs for different
                 levels in a memory hierarchy",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "4",
  number =       "4",
  pages =        "64--67",
  month =        jan,
  year =         "1976",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:06 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Dworak:1976:IIR,
  author =       "Paul E. Dworak and Alice C. Parker",
  title =        "An input interface for a real-time digital sound
                 generation system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "4",
  number =       "4",
  pages =        "68--73",
  month =        jan,
  year =         "1976",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:06 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Mulder:1976:MOD,
  author =       "Michael C. Mulder and Patrick P. Fasang",
  title =        "A microprocessor oriented data acquisition and control
                 system for power system control",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "4",
  number =       "4",
  pages =        "74--78",
  month =        jan,
  year =         "1976",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:06 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gladney:1976:MRT,
  author =       "H. M. Gladney and G. Hochweller",
  title =        "Multiprogramming for real-time applications",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "4",
  number =       "4",
  pages =        "79--85",
  month =        jan,
  year =         "1976",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:06 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kehl:1976:BAH,
  author =       "Theodore H. Kehl",
  title =        "{Basil} architecture --- an {HLL} minicomputer",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "4",
  number =       "4",
  pages =        "86--92",
  month =        jan,
  year =         "1976",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:06 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lawson:1976:FDC,
  author =       "Harold W. {Lawson, Jr.}",
  title =        "Function distribution in computer system
                 architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "4",
  number =       "4",
  pages =        "93--97",
  month =        jan,
  year =         "1976",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:06 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Vissers:1976:IDA,
  author =       "Chris A. Vissers",
  title =        "Interface, a dispersed architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "4",
  number =       "4",
  pages =        "98--104",
  month =        jan,
  year =         "1976",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:06 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thomasian:1976:DSS,
  author =       "A. Thomasian and A. Avizienis",
  title =        "A design study of a shared resource computing system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "4",
  number =       "4",
  pages =        "105--112",
  month =        jan,
  year =         "1976",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:06 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ford:1976:HSI,
  author =       "W. S. Ford and V. C. Hamacher",
  title =        "Hardware support for inter-process communication and
                 processor sharing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "4",
  number =       "4",
  pages =        "113--118",
  month =        jan,
  year =         "1976",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:06 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Trambacz:1976:TDP,
  author =       "Ulrich Trambacz and Georg Hyla",
  title =        "A taxonomy of display processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "4",
  number =       "4",
  pages =        "119--120",
  month =        jan,
  year =         "1976",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:06 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kluge:1976:TBT,
  author =       "W. E. Kluge",
  title =        "Traversing binary tree structures with shift register
                 memories (recent results)",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "4",
  number =       "4",
  pages =        "121.1--121.1",
  month =        jan,
  year =         "1976",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:06 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Fernandez:1976:ASS,
  author =       "Eduardo B. Fernandez and Rita C. Summers and Charles
                 D. Coleman",
  title =        "Architectural support for system protection (recent
                 results)",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "4",
  number =       "4",
  pages =        "121.2--121.2",
  month =        jan,
  year =         "1976",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:06 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gault:1976:DUP,
  author =       "James W. Gault and Alice C. Parker",
  title =        "The design of a user-programmable digital interface
                 (recent results)",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "4",
  number =       "4",
  pages =        "121.3--121.3",
  month =        jan,
  year =         "1976",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:06 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Fournier:1976:SDG,
  author =       "Serge Fournier and Ming T. Liu",
  title =        "System design of a grammar-programmable high-level
                 language machine",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "4",
  number =       "4",
  pages =        "122.4--122.4",
  month =        jan,
  year =         "1976",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:06 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kuznia:1976:SSM,
  author =       "Ch. Kuznia and R. Kober and H. Kopp",
  title =        "{SMS 101} --- a structured multi microprocessor
                 system with deadlock-free operation scheme",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "4",
  number =       "4",
  pages =        "122.5--122.5",
  month =        jan,
  year =         "1976",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:06 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Liu:1976:SSD,
  author          = {Philip S. Liu and Frederic J. Mowle},
  title           = {Selection schemes for dynamically microcoding
                     {Fortran} programs},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {4},
  number          = {4},
  pages           = {122.6--122.6},
  month           = jan,
  year            = {1976},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:06 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Fuller:1976:DMM,
  author          = {S. H. Fuller and D. P. Siewiorek and R. J. Swan},
  title           = {The design of a multi-micro-computer system},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {4},
  number          = {4},
  pages           = {123--123},
  month           = jan,
  year            = {1976},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:06 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Reames:1976:DSD,
  author          = {Cecil C. Reames and Ming T. Liu},
  title           = {Design and simulation of the distributed loop computer
                     network {(DLCN)}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {4},
  number          = {4},
  pages           = {124--129},
  month           = jan,
  year            = {1976},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:06 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Franchi:1976:DFC,
  author          = {Paolo Franchi},
  title           = {Distribution of functions and control in {RPCNET}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {4},
  number          = {4},
  pages           = {130--135},
  month           = jan,
  year            = {1976},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:06 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Wittie:1976:EMR,
  author          = {Larry D. Wittie},
  title           = {Efficient message routing in {Mega-Micro-Computer}
                     networks},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {4},
  number          = {4},
  pages           = {136--140},
  month           = jan,
  year            = {1976},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:06 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Welch:1976:IDO,
  author          = {Terry A. Welch},
  title           = {An investigation of descriptor oriented architecture},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {4},
  number          = {4},
  pages           = {141--146},
  month           = jan,
  year            = {1976},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:06 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Feustel:1976:TAS,
  author          = {E. A. Feustel},
  title           = {Tagged architecture and the semantics of programming
                     languages: {Extensible} types},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {4},
  number          = {4},
  pages           = {147--150},
  month           = jan,
  year            = {1976},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:06 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Batson:1976:DDA,
  author          = {A. P. Batson and R. E. Brundage and J. P. Kearns},
  title           = {Design data for {Algol-60} machines},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {4},
  number          = {4},
  pages           = {151--154},
  month           = jan,
  year            = {1976},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:06 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Strecker:1976:CMP,
  author          = {William D. Strecker},
  title           = {Cache memories for {PDP-11} family computers},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {4},
  number          = {4},
  pages           = {155--158},
  month           = jan,
  year            = {1976},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:06 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Patel:1976:ITP,
  author          = {Janak H. Patel and Edward S. Davidson},
  title           = {Improving the throughput of a pipeline by insertion of
                     delays},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {4},
  number          = {4},
  pages           = {159--164},
  month           = jan,
  year            = {1976},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:06 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Abd-Alla:1976:LAT,
  author          = {A. M. Abd-Alla and Laird H. Moffett},
  title           = {On-line architecture tuning using microcapture},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {4},
  number          = {4},
  pages           = {165--171},
  month           = jan,
  year            = {1976},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:06 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Healy:1976:COC,
  author          = {Leonard D. Healy},
  title           = {A character-oriented context-addressed
                     segment-sequential storage},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {4},
  number          = {4},
  pages           = {172--177},
  month           = jan,
  year            = {1976},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:06 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Bush:1976:SIS,
  author =       "J. A. Bush and G. J. Lipovski and S. Y. W. Su and J.
                 K. Watson and S. J. Ackerman",
  title =        "Some implementations of segment sequential functions",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "4",
  number =       "4",
  pages =        "178--185",
  month =        jan,
  year =         "1976",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:06 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{DeMartinis:1976:SMS,
  author          = {Manlio DeMartinis and G. Jack Lipovski and Stanley Y.
                     W. Su and J. K. Watson},
  title           = {A {Self Managing Secondary Memory} system},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {4},
  number          = {4},
  pages           = {186--194},
  month           = jan,
  year            = {1976},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:06 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Fuller:1976:PPC,
  author          = {Samuel H. Fuller},
  title           = {Price\slash performance comparison of {C.mmp} and the
                     {PDP-10}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {4},
  number          = {4},
  pages           = {195--202},
  month           = jan,
  year            = {1976},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:06 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Thorelli:1976:RAC,
  author          = {Lars-Erik Thorelli},
  title           = {Representation of arrays in computers},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {5},
  number          = {1},
  pages           = {6--9},
  month           = apr,
  year            = {1976},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:26 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Berndt:1976:ECA,
  author          = {Helmut Berndt},
  title           = {Evolutionary computer architecture: the {Unidata
                     7.000} series},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {5},
  number          = {1},
  pages           = {10--16},
  month           = apr,
  year            = {1976},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:26 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Dennis:1976:CAC,
  author          = {Jack B. Dennis},
  title           = {Computer architecture and the cost of software},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {5},
  number          = {1},
  pages           = {17--21},
  month           = apr,
  year            = {1976},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:26 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Lindamood:1976:NCA,
  author          = {George Lindamood},
  title           = {On navel contemplation and the art of computer
                     maintenance},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {5},
  number          = {1},
  pages           = {22--23},
  month           = apr,
  year            = {1976},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:26 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Fuller:1976:IMS,
  author          = {S. H. Fuller and G. A. Mathew},
  title           = {Implementing microprogram storage with {PLA}'s},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {5},
  number          = {2},
  pages           = {6--11},
  month           = jun,
  year            = {1976},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:42 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Hicks:1976:GQS,
  author          = {D. R. Hicks},
  title           = {A generalized queue scheme for process synchronization
                     and communication},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {5},
  number          = {2},
  pages           = {12--14},
  month           = jun,
  year            = {1976},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:42 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Langdon:1976:BRR,
  author          = {Glen G. Langdon},
  title           = {Book reviews: Review of {{\em Introduction to Computer
                     Architecture\/}} by {Harold S. Stone}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {5},
  number          = {2},
  pages           = {17--19},
  month           = jun,
  year            = {1976},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:42 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

%%% TO DO: [04-Sep-2014] Volume 5 number 3: no data yet in ACM Portal database

@Article{Thurber:1976:ANR,
  author          = {Kenneth J. Thurber},
  title           = {{ARPS}: a new real-time computer},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {5},
  number          = {4},
  pages           = {6--16},
  month           = oct,
  year            = {1976},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:09 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Salisbury:1976:MMC,
  author          = {Alan B. Salisbury},
  title           = {{MCF}: a military computer family for computer-based
                     systems},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {5},
  number          = {4},
  pages           = {17--20},
  month           = oct,
  year            = {1976},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:09 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Ris:1976:UDF,
  author          = {Frederic N. Ris},
  title           = {A unified decimal floating-point architecture for the
                     support of high-level languages},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {5},
  number          = {4},
  pages           = {21--31},
  month           = oct,
  year            = {1976},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:09 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Lipovski:1976:QS,
  author          = {G. Jack Lipovski},
  title           = {A question of style},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {5},
  number          = {4},
  pages           = {32--38},
  month           = oct,
  year            = {1976},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:09 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Chroust:1976:DIV,
  author          = {G. Chroust},
  title           = {Data interfaces versus control interfaces: a
                     half-baked conjecture},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {5},
  number          = {4},
  pages           = {39--40},
  month           = oct,
  year            = {1976},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:09 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

%%% TO DO: [04-Sep-2014] Volume 5 number 5: no data yet in ACM Portal database

@Article{Langdon:1977:CFM,
  author          = {Glen G. Langdon},
  title           = {Considerations on the ``figure of merit'' technique
                     for storage hierarchy design},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {5},
  number          = {6},
  pages           = {25--28},
  month           = feb,
  year            = {1977},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:28 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Miller:1977:BRRb,
  author          = {Edward F. Miller},
  title           = {Book Reviews: Review of {{\em High-Level Language
                     Computer Architecture\/}} by {Yaohan Chu. Academic
                     Press, New York, 1975}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {5},
  number          = {6},
  pages           = {29--29},
  month           = feb,
  year            = {1977},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:28 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Chu:1977:AHD,
  author          = {Yaohan Chu},
  title           = {Architecture of a hardware data interpreter},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {5},
  number          = {7},
  pages           = {1--9},
  month           = mar,
  year            = {1977},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Dasgupta:1977:DSL,
  author          = {Subrata Dasgupta},
  title           = {The design of some language constructs for horizontal
                     microprogramming},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {5},
  number          = {7},
  pages           = {10--16},
  month           = mar,
  year            = {1977},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Jensen:1977:HMM,
  author          = {E. Douglas Jensen and Richard Y. Kain},
  title           = {The {Honeywell Modular Microprogram Machine}: {M3}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {5},
  number          = {7},
  pages           = {17--28},
  month           = mar,
  year            = {1977},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Ramseyer:1977:MMI,
  author          = {Richard R. Ramseyer and Andries van Dam},
  title           = {A multi-microprocessor implementation of a general
                     purpose pipelined {CPU}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {5},
  number          = {7},
  pages           = {29--34},
  month           = mar,
  year            = {1977},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Ravi:1977:HMS,
  author          = {C. V. Ravi and Torben Moller},
  title           = {A hierarchical microcomputer system for hardware and
                     software development},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {5},
  number          = {7},
  pages           = {35--40},
  month           = mar,
  year            = {1977},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Harris:1977:HMO,
  author          = {J. Archer Harris and David R. Smith},
  title           = {Hierarchical multiprocessor organizations},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {5},
  number          = {7},
  pages           = {41--48},
  month           = mar,
  year            = {1977},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Hurakami:1977:PPS,
  author          = {K. Hurakami and S. Nishikawa and M. Sato},
  title           = {Poly-Processor {System} analysis and design},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {5},
  number          = {7},
  pages           = {49--56},
  month           = mar,
  year            = {1977},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Mazare:1977:FEH,
  author          = {Guy Mazare},
  title           = {A few examples of how to use a symmetrical
                     multi-micro-processor},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {5},
  number          = {7},
  pages           = {57--62},
  month           = mar,
  year            = {1977},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Kogge:1977:MPP,
  author          = {Peter M. Kogge},
  title           = {The microprogramming of pipelined processors},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {5},
  number          = {7},
  pages           = {63--69},
  month           = mar,
  year            = {1977},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Siegel:1977:UVT,
  author          = {Howard Jay Siegel},
  title           = {The universality of various types of {SIMD} machine
                     interconnection networks},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {5},
  number          = {7},
  pages           = {70--79},
  month           = mar,
  year            = {1977},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Rau:1977:EIF,
  author          = {Ramakrishna B. Rau and George E. Rossmann},
  title           = {The effect of instruction fetch strategies upon the
                     performance of pipelined instruction units},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {5},
  number          = {7},
  pages           = {80--89},
  month           = mar,
  year            = {1977},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Ahuja:1977:MMS,
  author          = {S. R. Ahuja and J. R. Jump},
  title           = {A modular memory scheme for array processing},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {5},
  number          = {7},
  pages           = {90--94},
  month           = mar,
  year            = {1977},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Haynes:1977:AAC,
  author          = {Leonard S. Haynes},
  title           = {The architecture of an {ALGOL 60} computer implemented
                     with distributed processors},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {5},
  number          = {7},
  pages           = {95--104},
  month           = mar,
  year            = {1977},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Sullivan:1977:LSHa,
  author =       "Herbert Sullivan and Theodore R. Bashkow",
  title =        "A large scale, homogeneous, fully distributed parallel
                 machine, {I}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "5",
  number =       "7",
  pages =        "105--117",
  month =        mar,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sullivan:1977:LSHb,
  author =       "Herbert Sullivan and Theodore R. Bashkow and David
                 Klappholz",
  title =        "A large scale, homogeneous, fully distributed parallel
                 machine, {II}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "5",
  number =       "7",
  pages =        "118--124",
  month =        mar,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lipovski:1977:VMM,
  author          = {G. Jack Lipovski},
  title           = {On virtual memories and micronetworks},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {5},
  number          = {7},
  pages           = {125--134},
  month           = mar,
  year            = {1977},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Strauss:1977:CNT,
  author          = {Jon C. Strauss and Kenneth J. Thurber},
  title           = {Considerations for new tactical computer systems},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {5},
  number          = {7},
  pages           = {135--140},
  month           = mar,
  year            = {1977},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Thurber:1977:ATC,
  author          = {Kenneth J. Thurber and
                     Peter C. Patton and
                     Robert C. Deward and
                     Jon C. Strauss and
                     Thomas W. Petschauer},
  title           = {An advanced tactical computer concept},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1977,
  month           = mar,
  volume          = 5,
  number          = 7,
  pages           = {141--146},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Nutt:1977:MIP,
  author          = {Gary J. Nutt},
  title           = {Microprocessor implementation of a parallel processor},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1977,
  month           = mar,
  volume          = 5,
  number          = 7,
  pages           = {147--152},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Dworak:1977:DIR,
  author          = {Paul Dworak and
                     Alice C. Parker and
                     Richard Blum},
  title           = {The design and implementation of a real-time sound
                     generation system},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1977,
  month           = mar,
  volume          = 5,
  number          = 7,
  pages           = {153--158},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Parker:1977:HST,
  author          = {A. C. Parker and A. W. Nagle},
  title           = {Hardware\slash software tradeoffs in a variable word
                     width, variable queue length buffer memory},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1977,
  month           = mar,
  volume          = 5,
  number          = 7,
  pages           = {159--164},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Peuto:1977:ITM,
  author          = {Bernard L. Peuto and Leonard J. Shustek},
  title           = {An instruction timing model of {CPU} performance},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1977,
  month           = mar,
  volume          = 5,
  number          = 7,
  pages           = {165--178},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Hoogendoorn:1977:RMI,
  author          = {Cornelis H. Hoogendoorn},
  title           = {Reduction of memory interference in multiprocessor
                     systems},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1977,
  month           = mar,
  volume          = 5,
  number          = 7,
  pages           = {179--183},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Hammerstrom:1977:ICC,
  author          = {D. W. Hammerstrom and E. S. Davidson},
  title           = {Information content of {CPU} memory referencing
                     behavior},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1977,
  month           = mar,
  volume          = 5,
  number          = 7,
  pages           = {184--192},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Liu:1977:MCP,
  author          = {Ming T. Liu and Cecil C. Reames},
  title           = {Message communication protocol and operating system
                     design for the {Distributed Loop Computer Network
                     (DLCN)}},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1977,
  month           = mar,
  volume          = 5,
  number          = 7,
  pages           = {193--200},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Poujoulat:1977:ACB,
  author          = {G. H. Poujoulat},
  title           = {Architecture of the {CORAIL} building block system},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1977,
  month           = mar,
  volume          = 5,
  number          = 7,
  pages           = {201--204},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Tredennick:1977:HSB,
  author          = {H. L. Tredennick and T. A. Welch},
  title           = {High-speed buffering for variable length operands},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1977,
  month           = mar,
  volume          = 5,
  number          = 7,
  pages           = {205--210},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Steel:1977:AGP,
  author          = {Rod Steel},
  title           = {Another general purpose computer architecture},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1977,
  month           = apr,
  volume          = 5,
  number          = 8,
  pages           = {5--11},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:31 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Lindamood:1977:WN,
  author          = {George E. Lindamood},
  title           = {What's in a name?},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1977,
  month           = apr,
  volume          = 5,
  number          = 8,
  pages           = {12--14},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:31 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Schneiker:1977:MF,
  author          = {Conrad Schneiker},
  title           = {The microprocessors of the future},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1977,
  month           = apr,
  volume          = 5,
  number          = 8,
  pages           = {15--16},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:31 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Miller:1977:BRR,
  author          = {Edward F. {Miller, Jr.}},
  title           = {Book review: Review of {{\em Large-Scale Computer
                     Architecture: Parallel and Associative Processors\/}}
                     by {Kenneth J. Thurber, Hayden Book Company, Rochelle
                     Park, New Jersey 1976}},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1977,
  month           = apr,
  volume          = 5,
  number          = 8,
  pages           = {17--17},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:31 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Conner:1977:IOC,
  author          = {William M. Conner and Edward R. Dirling},
  title           = {Input\slash Output considerations in look-ahead
                     processing},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1977,
  month           = jun,
  volume          = 6,
  number          = 1,
  pages           = {7--12},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:40:27 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Rosin:1977:SM,
  author          = {Robert F. Rosin},
  title           = {The significance of microprogramming},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1977,
  month           = jun,
  volume          = 6,
  number          = 1,
  pages           = {14--19},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:40:27 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Gonzalez:1977:BRR,
  author          = {Mario J. Gonzalez},
  title           = {Book review: Review of {{\em Microprogramming
                     Primer\/}} by {Harry Katzan, Jr., McGraw-Hill 1977}},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1977,
  month           = jun,
  volume          = 6,
  number          = 1,
  pages           = {29--30},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:40:27 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Vineberg:1977:ICS,
  author          = {Maniel Vineberg},
  title           = {Implementation of character string pattern matching on
                     a multiprocessor},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1977,
  month           = may,
  volume          = 6,
  number          = 2,
  pages           = {1--7},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Bird:1977:APP,
  author          = {R. M. Bird and
                     J. C. Tu and
                     R. M. Worthy},
  title           = {Associative\slash parallel processors for searching
                     very large textual data bases},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1977,
  month           = may,
  volume          = 6,
  number          = 2,
  pages           = {8--9},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Lipovski:1977:IFT,
  author          = {G. J. Lipovski},
  title           = {On imaginary fields, token transfers and floating
                     codes in intelligent secondary memories},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1977,
  month           = may,
  volume          = 6,
  number          = 2,
  pages           = {17--22},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Zaky:1977:MNN,
  author          = {S. G. Zaky},
  title           = {Microprocessors for non-numeric processing},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1977,
  month           = may,
  volume          = 6,
  number          = 2,
  pages           = {23--30},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Hsiao:1977:ADC,
  author          = {David K. Hsiao and Krishnamurthi Kannan},
  title           = {The architecture of a database computer --- a summary},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1977,
  month           = may,
  volume          = 6,
  number          = 2,
  pages           = {31--33},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Rosenthal:1977:DMM,
  author          = {Robert S. Rosenthal},
  title           = {The data management machine, a classification},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1977,
  month           = may,
  volume          = 6,
  number          = 2,
  pages           = {35--39},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{McDonell:1977:TNS,
  author          = {Ken J. McDonell},
  title           = {Trends in non-software support for input-output
                     functions},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1977,
  month           = may,
  volume          = 6,
  number          = 2,
  pages           = {40--47},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Cerretti:1977:UIP,
  author          = {R. Cerretti and
                     D. Jasilli and
                     D. R. Matteucci},
  title           = {{Ulisse}: {An Italian} project for a multifunctional
                     terminal system},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1977,
  month           = may,
  volume          = 6,
  number          = 2,
  pages           = {48--50},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Bray:1977:DMR,
  author          = {Olin H. Bray},
  title           = {Data management requirements: {The} similarity of
                     memory management, database systems, and message
                     processing},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1977,
  month           = may,
  volume          = 6,
  number          = 2,
  pages           = {68--76},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

%%% NOTE(review): the title previously read "associate computing", which
%%% looks like a truncation of "associative computing" (this issue is
%%% devoted to associative/parallel non-numeric processors).  The earlier
%%% wording is preserved in the ignored xxtitle field; confirm against
%%% the printed issue.
@Article{Landson:1977:CSA,
  author =       "Barry M. Landson and Robert G. Sargent",
  title =        "A comparison of sequential and associative computing
                 of priority queues",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "6",
  number =       "2",
  pages =        "77--78",
  month =        may,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  xxtitle =      "A comparison of sequential and associate computing of
                 priority queues",
}

@Article{Myers:1977:CAS,
  author          = {Glenford J. Myers},
  title           = {The case against stack-oriented instruction sets},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1977,
  month           = aug,
  volume          = 6,
  number          = 3,
  pages           = {7--10},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Tanenbaum:1977:AMA,
  author          = {Andrew S. Tanenbaum},
  title           = {Ambiguous machine architecture and program efficiency},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1977,
  month           = aug,
  volume          = 6,
  number          = 3,
  pages           = {11--13},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Hicks:1977:MCA,
  author          = {D. R. Hicks},
  title           = {Microprogramming with a content-addressable
                     read-only-memory},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1977,
  month           = aug,
  volume          = 6,
  number          = 3,
  pages           = {14--15},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Hicks:1977:MPS,
  author          = {D. R. Hicks},
  title           = {Multitasking as a program structuring primitive},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1977,
  month           = aug,
  volume          = 6,
  number          = 3,
  pages           = {16--18},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

%%% NOTE(review): publisher spelling in the title normalized from
%%% "Prentice Hall" to "Prentice-Hall", the form used by the other review
%%% titles in this file; confirm against the printed issue if exact
%%% transcription matters.
@Article{Chroust:1977:BRR,
  author =       "G. Chroust",
  title =        "Book reviews: Review of {{\em Digital System
                 Implementation\/}} by {Gerrit A. Blaauw, Prentice-Hall,
                 Series in Automatic Computation 1976}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "6",
  number =       "4",
  pages =        "27--28",
  month =        oct,
  year =         "1977",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:09 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hagan:1977:VMS,
  author          = {R. A. Hagan and C. S. Wallace},
  title           = {A virtual memory system for the {Hewlett Packard
                     2100A}},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1977,
  month           = dec,
  volume          = 6,
  number          = 5,
  pages           = {5--13},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:17 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Baskett:1977:MMF,
  author          = {Forest Baskett},
  title           = {More on microprocessors of the future},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1977,
  month           = dec,
  volume          = 6,
  number          = 5,
  pages           = {14--17},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:17 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Chu:1977:DEC,
  author          = {Yaohan Chu},
  title           = {Direct-execution computer architecture},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1977,
  month           = dec,
  volume          = 6,
  number          = 5,
  pages           = {18--23},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:17 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Schulthess:1977:RCA,
  author          = {Peter U. Schulthess and Eduard P. Mumprecht},
  title           = {Reply to the case against stack-oriented instruction
                     sets},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1977,
  month           = dec,
  volume          = 6,
  number          = 5,
  pages           = {24--27},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:17 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Mountain:1978:AMC,
  author          = {John B. Mountain and Philip H. Enslow},
  title           = {Application of the military computer family
                     architecture selection criteria to the {PR1ME P400}},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1978,
  month           = feb,
  volume          = 6,
  number          = 6,
  pages           = {3--17},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:28 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Lipovski:1978:JFM,
  author          = {G. Jack Lipovski},
  title           = {Just a few more words on microprocessors of the
                     future},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1978,
  month           = feb,
  volume          = 6,
  number          = 6,
  pages           = {18--21},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:28 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Keedy:1978:USE,
  author          = {J. L. Keedy},
  title           = {On the use of stacks in the evaluation of expressions},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1978,
  month           = feb,
  volume          = 6,
  number          = 6,
  pages           = {22--28},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:28 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Tanenbaum:1978:RPA,
  author          = {Andrew S. Tanenbaum},
  title           = {Review of {{\em Processor Architecture\/}} by {S. H.
                     Lavington, NCC Publications, Manchester 1976}},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1978,
  month           = feb,
  volume          = 6,
  number          = 6,
  pages           = {31--31},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:28 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Whiteside:1978:BRR,
  author          = {A. E. Whiteside},
  title           = {Book reviews: Review of {{\em The Architecture of
                     Concurrent Programs\/}} by {Per Brinch Hansen,
                     Prentice-Hall 1977}},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1978,
  month           = feb,
  volume          = 6,
  number          = 6,
  pages           = {32--32},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:28 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Bhandarkar:1978:STT,
  author          = {Dileep P. Bhandarkar and J. Egil Juliussen},
  title           = {Semiconductor technology: trends and implications},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1978,
  month           = aug,
  volume          = 7,
  number          = 1,
  pages           = {4--14},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:40:27 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Payne:1978:CCD,
  author          = {A. J. Payne},
  title           = {A computer console design to help the operator},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1978,
  month           = aug,
  volume          = 7,
  number          = 1,
  pages           = {15--22},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:40:27 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

%%% NOTE(review): the separator between the book author and publisher in
%%% the title was a period ("Foster. Van Nostrand"); it is normalized to
%%% a comma, the form used by the other review titles in this file.
@Article{McGlynn:1978:RCA,
  author =       "Daniel R. McGlynn",
  title =        "Review of {{\em Content Addressable Parallel
                 Processors\/}} by {Caxton C. Foster, Van Nostrand
                 Reinhold Co. 1976}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "1",
  pages =        "23--23",
  month =        aug,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:27 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ramamoorthy:1978:RSC,
  author          = {C. V. Ramamoorthy},
  title           = {Review of {{\em Structured Computer Organization\/}}
                     by {Andrew S. Tanenbaum, Prentice-Hall 1976}},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1978,
  month           = aug,
  volume          = 7,
  number          = 1,
  pages           = {23--23},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:40:27 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Buchholz:1978:RCS,
  author          = {W. Buchholz},
  title           = {Review of {{\em Computer System Architecture\/}} by
                     {M. Morris Mano, Prentice-Hall 1976}},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 1978,
  month           = aug,
  volume          = 7,
  number          = 1,
  pages           = {24--24},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:40:27 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Vranesic:1978:BRR,
  author =       "Z. G. Vranesic",
  title =        "Book reviews: Review of {{\em Content Addressable
                 Parallel Processors\/}} by {Caxton C. Foster, Van
                 Nostrand Reinhold Co. 1976}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "1",
  pages =        "24--24",
  month =        aug,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:27 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Korfhage:1978:DPU,
  author =       "R. R. Korfhage and W. H. E. Day and L. L. Beck and W.
                 F. Appelbe",
  title =        "Data physics: an unorthodox view of data and its
                 implications in data processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "2",
  pages =        "1--7",
  month =        aug,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:41 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Copeland:1978:SSS,
  author =       "George P. Copeland",
  title =        "String storage and searching for data base
                 applications: implementation on the {INDY} backend
                 kernel",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "2",
  pages =        "8--17",
  month =        aug,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:41 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Otis:1978:ERD,
  author =       "Allen J. Otis and George P. Copeland",
  title =        "Editing requirements for data base applications and
                 their implementation on the {INDY} backend kernel",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "2",
  pages =        "18--29",
  month =        aug,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:41 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lipovski:1978:SPI,
  author =       "G. Jack Lipovski",
  title =        "Semantic paging on intelligent discs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "2",
  pages =        "30--34",
  month =        aug,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:41 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Williams:1978:MSD,
  author =       "Rhon Williams",
  title =        "A multiprocessing system for the direct execution of
                 {LISP}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "2",
  pages =        "35--41",
  month =        aug,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:41 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bird:1978:TFI,
  author =       "R. M. Bird and J. B. Newsbaum and J. L. Trefftzs",
  title =        "Text file inversion: an evaluation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "2",
  pages =        "42--50",
  month =        aug,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:41 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Roberts:1978:SCA,
  author =       "David C. Roberts",
  title =        "A specialized computer architecture for text
                 retrieval",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "2",
  pages =        "51--59",
  month =        aug,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:41 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Stucki:1978:CCA,
  author =       "M. J. Stucki and J. R. Cox and G. C. Roman and P. N.
                 Turcu",
  title =        "Coordinating concurrent access in a distributed
                 database architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "2",
  pages =        "60--64",
  month =        aug,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:41 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gouda:1978:HCC,
  author =       "Mohamed G. Gouda",
  title =        "A hierarchical controller for concurrent accessing of
                 distributed databases",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "2",
  pages =        "65--70",
  month =        aug,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:41 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gavish:1978:EAD,
  author =       "Bezalel Gavish and Harvey Koch",
  title =        "An extensible architecture for data flow processing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "2",
  pages =        "71--76",
  month =        aug,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:41 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Harvill:1978:FPO,
  author =       "J. B. Harvill",
  title =        "Functional parallelism in an operand state saving
                 computer",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "2",
  pages =        "77--84",
  month =        aug,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:41 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hutchison:1978:MM,
  author =       "J. S. Hutchison and W. G. Roman",
  title =        "Madman machine",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "2",
  pages =        "85--90",
  month =        aug,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:41 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Banerjee:1978:UDM,
  author =       "Jayanta Banerjee and David K. Hsiao",
  title =        "The use of a database machine for supporting
                 relational databases",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "2",
  pages =        "91--98",
  month =        aug,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:41 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sadowski:1978:EPR,
  author =       "Paul J. Sadowski and S. A. Schuster",
  title =        "Exploiting parallelism in a {Relational Associative
                 Processor}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "2",
  pages =        "99--109",
  month =        aug,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:41 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chang:1978:BRD,
  author =       "Hsu Chang",
  title =        "Bubbles for relational database",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "2",
  pages =        "110--116",
  month =        aug,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:41 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{ElMasri:1978:MIR,
  author =       "A. {El Masri} and J. Rohmer and D. Tusera",
  title =        "A machine for information retrieval",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "2",
  pages =        "117--120",
  month =        aug,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:41 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Matteucci:1978:DSA,
  author =       "Dante R. Matteucci",
  title =        "A distributed structure for the automization of the
                 {Catalog of the National Cultural Heritage}:
                 experiences and proposals",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "2",
  pages =        "121--133",
  month =        aug,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:41 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thurber:1978:CCT,
  author =       "Kenneth J. Thurber",
  title =        "Computer communication techniques",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "3",
  pages =        "7--16",
  month =        oct,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:02 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Jennings:1978:VP,
  author =       "Hal W. Jennings",
  title =        "A variation on the {PDP 11}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "3",
  pages =        "17--26",
  month =        oct,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:02 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hansen:1978:MAC,
  author =       "Per {Brinch Hansen}",
  title =        "Multiprocessor architectures for concurrent programs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "4",
  pages =        "4--23",
  month =        dec,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Keedy:1978:EEU,
  author =       "J. L. Keedy",
  title =        "On the evaluation of expressions using accumulators,
                 stacks and store-to-store instructions",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "4",
  pages =        "24--27",
  month =        dec,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chattergy:1978:CL,
  author =       "Rahul Chattergy",
  title =        "In the current literature",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "4",
  pages =        "30--30",
  month =        dec,
  year =         "1978",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Cragon:1979:ECS,
  author =       "Harvey G. Cragon",
  title =        "An evaluation of code space requirements and
                 performance of various architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "5",
  pages =        "5--21",
  month =        feb,
  year =         "1979",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:24 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thurber:1979:BLC,
  author =       "Kenneth J. Thurber and Harvey A. Freeman",
  title =        "A bibliography of local computer network
                 architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "5",
  pages =        "22--27",
  month =        feb,
  year =         "1979",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:24 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Cox:1979:NCA,
  author =       "Lyle A. {Cox, Jr.}",
  title =        "The nature of ``computer architecture''",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "7",
  pages =        "8--12",
  month =        apr,
  year =         "1979",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:29 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{vandeSnepscheut:1979:INP,
  author =       "Jan L. A. van de Snepscheut and Gert A. Slavenburg",
  title =        "Introducing the notion of processes to hardware",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "7",
  pages =        "13--23",
  month =        apr,
  year =         "1979",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:29 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Atkins:1979:RAC,
  author =       "D. E. Atkins",
  title =        "Review of {{\em Advances in Computer Architecture\/}}
                 by {Glenford J. Myers. Wiley-Interscience Division of
                 John Wiley and Sons 1978}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "7",
  pages =        "25--26",
  month =        apr,
  year =         "1979",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:29 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bowyer:1979:BRS,
  author =       "Kevin W. Bowyer",
  title =        "Book review of {{\em The Structure of Computers and
                 Computations: Volume One\/}} by {David J. Kuck. John
                 Wiley \& Sons 1978}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "7",
  pages =        "27--30",
  month =        apr,
  year =         "1979",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:29 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gibson:1979:TOR,
  author =       "Randall Gibson and Paul Anderson",
  title =        "Technical overview of the {Renaissance Octobus}
                 system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "8",
  pages =        "2--9",
  month =        jun,
  year =         "1979",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:31 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Stevenson:1979:EEM,
  author =       "Johan W. Stevenson and Andrew S. Tanenbaum",
  title =        "Efficient encoding of machine instructions",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "8",
  pages =        "10--17",
  month =        jun,
  year =         "1979",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:31 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Keedy:1979:MUS,
  author =       "J. L. Keedy",
  title =        "More on the use of stacks in the evaluation of
                 expressions",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "8",
  pages =        "18--22",
  month =        jun,
  year =         "1979",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:31 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Quick:1979:IMP,
  author =       "G. E. Quick",
  title =        "Intelligent memory: ``a parallel processing
                 concept''",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "8",
  pages =        "23--28",
  month =        jun,
  year =         "1979",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:31 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Rivest:1979:BCA,
  author =       "Ronald L. Rivest",
  title =        "The {BLIZZARD} computer architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "9",
  pages =        "2--10",
  month =        aug,
  year =         "1979",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:31 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Keedy:1979:TPR,
  author =       "J. L. Keedy",
  title =        "A technique for passing reference parameters in an
                 information-hiding architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "9",
  pages =        "11--15",
  month =        aug,
  year =         "1979",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:31 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kavipurapu:1979:QAU,
  author =       "Krishna M. Kavipurapu and Dennis J. Frailey",
  title =        "Quantification of architectures using software
                 science",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "10",
  pages =        "2--6",
  month =        oct,
  year =         "1979",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:26 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Turton:1979:PHS,
  author =       "Trevor Turton",
  title =        "A proposed high-speed computer design",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "10",
  pages =        "7--21",
  month =        oct,
  year =         "1979",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:26 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Staff:1979:CL,
  author =       "{{Computer Architecture News} staff}",
  title =        "In the current literature",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "7",
  number =       "10",
  pages =        "22--22",
  month =        oct,
  year =         "1979",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:26 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% TO DO: [04-Sep-2014] Volume 8 number 1: no data yet in ACM Portal database

@Article{Richards:1980:CE,
  author =       "Dana Richards",
  title =        "On a {``Counter-Example''}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "2",
  pages =        "2--3",
  month =        apr,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:39 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Denning:1980:WIC,
  author =       "Peter J. Denning",
  title =        "Why not innovations in computer architecture?",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "2",
  pages =        "4--7",
  month =        apr,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:39 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gerrity:1980:HDU,
  author =       "G. W. Gerrity",
  title =        "Hardware detection of undefined references",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "2",
  pages =        "8--11",
  month =        apr,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:39 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Denning:1980:MCS,
  author =       "Peter J. Denning and T. Don Dennis",
  title =        "On minimizing contention at semaphores",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "2",
  pages =        "12--19",
  month =        apr,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:39 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Dennis:1980:BBD,
  author =       "Jack B. Dennis and G. Andrew Boughton and Clement K.
                 C. Leung",
  title =        "Building blocks for data flow prototypes",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "1--8",
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Davidson:1980:MSM,
  author =       "Edward S. Davidson",
  title =        "A multiple stream microprocessor prototype system:
                 {AMP-1}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "9--16",
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Andre:1980:KAO,
  author =       "F. Andr{\'e} and J. P. Ban{\^a}tre and H. Leroy and G.
                 Paget and F. Ployette and J. P. Routeau",
  title =        "{KENSUR}: An architecture oriented towards programming
                 languages translation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "17--22",
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kuhl:1980:DFT,
  author =       "J. G. Kuhl and S. M. Reddy",
  title =        "Distributed fault-tolerance for large multiprocessor
                 systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "23--30",
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Malek:1980:CCA,
  author =       "Miroslaw Malek",
  title =        "A comparison connection assignment for diagnosis of
                 multiprocessor systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "31--36",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Grosspietsch:1980:CTR,
  author =       "K. E. Grosspietsch and J. Kaiser and E. Nett",
  title =        "A concept for test and reconfiguration of a
                 fault-tolerant {VLSI} processor system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "37--43",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Brassard:1980:PBC,
  author =       "Jean-Paul Brassard and Jan Gecsei",
  title =        "Path building in cellular partitioning networks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "44--50",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{McMillen:1980:MMC,
  author =       "Robert J. McMillen and Howard Jay Siegel",
  title =        "{MIMD} machine communication using the augmented data
                 manipulator network",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "51--60",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Shen:1980:FTC,
  author =       "John P. Shen and John P. Hayes",
  title =        "Fault tolerance of a class of connecting networks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "61--71",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Coffman:1980:CBS,
  author =       "E. G. {Coffman, Jr.} and Kimming So",
  title =        "On the comparison between single and multiple
                 processor systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "72--79",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hamacher:1980:PCF,
  author =       "V. Carl Hamacher and Gerald S. Shedler",
  title =        "Performance of a collision-free local bus network
                 having asynchronous distributed control",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "80--87",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Zuberek:1980:TPN,
  author =       "W. M. Zuberek",
  title =        "Timed {Petri} nets and preliminary performance
                 evaluation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "88--96",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ditzel:1980:RHL,
  author =       "David R. Ditzel and David A. Patterson",
  title =        "Retrospective on high-level language computer
                 architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "97--104",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sansonnet:1980:MLD,
  author =       "J. P. Sansonnet and M. Castan and C. Percebois",
  title =        "{M3L}: a list-directed architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "105--112",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hibino:1980:PPG,
  author =       {Yasushi Hibino},
  title =        {A Practical Parallel Garbage Collection Algorithm and
                 Its Implementation},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {8},
  number =       {3},
  pages =        {113--120},
  month =        may,
  year =         {1980},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 16:54:57 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  URL =          {ftp://ftp.math.utah.edu/pub/mirrors/ftp.ira.uka.de/bibliography/Compiler/garbage.collection.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
  keywords =     {Hardware assisted GC},
}

@Article{Treleaven:1980:MPR,
  author =       "Philip C. Treleaven and Geoffrey F. Mole",
  title =        "A multi-processor reduction machine for user-defined
                 reduction languages",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "121--130",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tobias:1980:SUM,
  author =       "Jeffrey M. Tobias",
  title =        "A single user multiprocessor incorporating processor
                 manipulation facilities",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "131--138",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Halstead:1980:MSD,
  author =       "Robert H. {Halstead, Jr.} and Stephen A. Ward",
  title =        "The {MuNet}: a scalable decentralized architecture for
                 parallel computation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "139--145",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lampson:1980:PHP,
  author =       "Butler W. Lampson and Kenneth A. Pier",
  title =        "A processor for a high-performance personal computer",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "146--160",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Edwards:1980:MGN,
  author =       "D. B. G. Edwards and A. E. Knowles and J. V. Woods",
  title =        "{MU6-G}: a new design to achieve mainframe performance
                 from a mini-sized computer",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "161--167",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Batcher:1980:AMP,
  author =       "Kenneth E. Batcher",
  title =        "Architecture of a massively parallel processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "168--173",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Palmer:1980:IND,
  author =       "John Palmer",
  title =        "The {Intel 8087} numeric data processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "174--181",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kuhn:1980:EMA,
  author =       "Robert H. Kuhn",
  title =        "Efficient mapping of algorithms to single-stage
                 interconnections",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "182--189",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Nassimi:1980:SRB,
  author =       "David Nassimi and Sartaj Sahni",
  title =        "A self routing {Benes} network",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "190--195",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{vonIssendorff:1980:ANF,
  author =       "H. von Issendorff and W. Gr{\"u}newald",
  title =        "An adaptable network for functional distributed
                 systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "196--201",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Riad:1980:CFC,
  author =       "Mokhtar Boshra Riad",
  title =        "A combination of field and current access techniques
                 for efficient and cost-effective bubble memories",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "202--210",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Trivedi:1980:DLS,
  author =       "K. S. Trivedi",
  title =        "Designing linear storage hierarchies so as to maximize
                 reliability subject to cost and performance
                 constraints",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "211--217",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ahuja:1980:APP,
  author =       "Sudhir R. Ahuja and Charles S. Roberts",
  title =        "An associative\slash parallel processor for partial
                 match retrieval using superimposed codes",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "218--227",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ruggiero:1980:MBV,
  author =       "M. D. Ruggiero and S. G. Zaky",
  title =        "A microprocessor-based virtual memory system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "228--235",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Jagannathan:1980:TAI,
  author =       "Anand Jagannathan",
  title =        "A technique for the architectural implementation of
                 software subsystems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "236--244",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Berstis:1980:SPD,
  author =       "Viktors Berstis",
  title =        "Security and protection of data in the {IBM
                 System\slash 38}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "245--252",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hoffmann:1980:HIC,
  author =       "Miguel Garc{\'\i}a Hoffmann",
  title =        "Hardware implementation of communication protocols:
                 a formal approach",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "253--263",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Guillier:1980:ACF,
  author =       "P. Guillier and D. Slosberg",
  title =        "An architecture with comprehensive facilities of
                 inter-process synchronization and communication",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "264--270",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lougheed:1980:CPP,
  author =       "Robert M. Lougheed and David L. McCubbrey",
  title =        "The cytocomputer: a practical pipelined image
                 processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "271--277",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Halatsis:1980:ACM,
  author =       "C. Halatsis and A. van Dam and J. Joosten and M.
                 Letheren",
  title =        "Architectural considerations for a microprogrammable
                 emulating engine using bit-slices",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "278--291",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Irwin:1980:OPS,
  author =       "Mary Jane Irwin and Don Heller",
  title =        "Online pipeline systems for recursive numeric
                 computations",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "292--299",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Foster:1980:DSP,
  author =       "M. J. Foster and H. T. Kung",
  title =        "Design of special-purpose {VLSI} chips: Example and
                 opinions",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "300--307",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kumar:1980:SLC,
  author =       "Anshul Kumar and P. C. P. Bhatt",
  title =        "A structured language for {CAD} of digital systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "308--316",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hercksen:1980:HMS,
  author =       "Uwe Hercksen and Rainer Klar and Wolfgang
                 Klein{\"o}der",
  title =        "Hardware-measurements of storage access conflicts in
                 the processor array {EGPA(1)}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "317--324",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tokoro:1980:HLM,
  author =       "Mario Tokoro and Kiichiro Tamaru and Masaaki Mizuno
                 and Masao Hori",
  title =        "A high level multi-lingual multiprocessor {KMP\slash
                 II}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "3",
  pages =        "325--333",
  month =        may,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:54:57 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Aupperle:1980:RIC,
  author =       {Ken Aupperle},
  title =        {A real innovation in computer architecture},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {8},
  number =       {4},
  pages =        {6--7},
  month =        jun,
  year =         {1980},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:07 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Galloway:1980:AIR,
  author =       {John R. {Galloway, Jr.}},
  title =        {Architectural innovation round: round \#3},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {8},
  number =       {4},
  pages =        {8--10},
  month =        jun,
  year =         {1980},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:07 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Sharp:1980:STD,
  author =       {John A. Sharp},
  title =        {Some thoughts on data flow architectures},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {8},
  number =       {4},
  pages =        {11--21},
  month =        jun,
  year =         {1980},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:07 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Payne:1980:VFP,
  author =       {Mary Payne and Dileep Bhandarkar},
  title =        {{VAX} floating point: a solid foundation for numerical
                 computation},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {8},
  number =       {4},
  pages =        {22--33},
  month =        jun,
  year =         {1980},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/641845.641849},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Sat Jun 24 12:02:21 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Dickman:1980:TR,
  author =       {Lloyd Dickman},
  title =        {Treasurer's report},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {8},
  number =       {4},
  pages =        {37--38},
  month =        jun,
  year =         {1980},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:07 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Staff:1980:CLAa,
  author =       {{Computer Architecture News} staff},
  title =        {Current literature: abstracts of articles of
                 interest\ldots{}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {8},
  number =       {4},
  pages =        {48--48},
  month =        jun,
  year =         {1980},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:07 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Davies:1980:CAM,
  author =       {Julian Davies},
  title =        {Clock architecture and management},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {8},
  number =       {5},
  pages =        {3--6},
  month =        aug,
  year =         {1980},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:16 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Chroust:1980:RMO,
  author =       {G. Chroust and J. R. M{\"u}hlbacher},
  title =        {Rivalling multiprocessor organization: a
                 hardware\slash speed trade-off},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {8},
  number =       {5},
  pages =        {7--10},
  month =        aug,
  year =         {1980},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:16 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Stevenson:1980:RPI,
  author =       {David Stevenson},
  title =        {A report on the proposed {IEEE Floating Point Standard
                 (IEEE Task p754)}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {8},
  number =       {5},
  pages =        {11--12},
  month =        aug,
  year =         {1980},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:16 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Rattner:1980:OBC,
  author =       {Justin Rattner and George Cox},
  title =        {Object-based computer architecture},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {8},
  number =       {6},
  pages =        {4--11},
  month =        oct,
  year =         {1980},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:29 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Myers:1980:HIC,
  author =       {G. J. Myers and B. R. S. Buckingham},
  title =        {A hardware implementation of capability-based
                 addressing},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {8},
  number =       {6},
  pages =        {12--24},
  month =        oct,
  year =         {1980},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:29 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Patterson:1980:CRI,
  author =       {Patterson, David A. and Ditzel, David R.},
  title =        {The case for the reduced instruction set computer},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {8},
  number =       {6},
  pages =        {25--33},
  month =        oct,
  year =         {1980},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:29 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Clark:1980:CCR,
  author =       {Clark, Douglas W. and Strecker, William D.},
  title =        {Comments on {``The Case for the Reduced Instruction
                 Set Computer,''} by {Patterson} and {Ditzel}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {8},
  number =       {6},
  pages =        {34--38},
  month =        oct,
  year =         {1980},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:29 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Brakefield:1980:BAT,
  author =       {Brakefield, James C.},
  title =        {Is 32 bits of address too much?},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {8},
  number =       {6},
  pages =        {39--40},
  month =        oct,
  year =         {1980},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:29 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Brakefield:1980:PB,
  author =       {Brakefield, James C.},
  title =        {The peripheral bus},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {8},
  number =       {6},
  pages =        {41--43},
  month =        oct,
  year =         {1980},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:29 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

%%% Book review: the title field embeds the reviewed book's title,
%%% author, publisher, and year.
@Article{Mudge:1980:BRR,
  author =       "Trevor Mudge",
  title =        "Book reviews: Review of {{\em The Structure of
                 Computers and Computation, Vol. I\/}} by {David J.
                 Kuck, John Wiley \& Sons 1978}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "8",
  number =       "6",
  pages =        "44--45",
  month =        oct,
  year =         "1980",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:29 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Staff:1980:CLAb,
  author =       {Computer Architecture News Staff},
  title =        {Current literature: abstracts of articles of
                 interest\ldots{}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {8},
  number =       {6},
  pages =        {46--46},
  month =        oct,
  year =         {1980},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:29 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Reed:1980:WFC,
  author =       {Reed, Karl},
  title =        {The way forward in computer architecture research},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {8},
  number =       {7},
  pages =        {3--7},
  month =        oct,
  year =         {1980},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Gilmore:1980:SEM,
  author =       {Gilmore, John},
  title =        {Suggested enhancements to the {Motorola MC68000}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {8},
  number =       {7},
  pages =        {8--14},
  month =        oct,
  year =         {1980},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Wakerly:1980:PED,
  author =       {Wakerly, John F.},
  title =        {{Pascal} extensions for describing computer
                 instruction sets},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {8},
  number =       {7},
  pages =        {15--23},
  month =        oct,
  year =         {1980},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Kavi:1980:SA,
  author =       {Kavi, Krishna M.},
  title =        {Semantics of an algorithm},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {8},
  number =       {7},
  pages =        {24--26},
  month =        oct,
  year =         {1980},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Treleaven:1980:VMA,
  author =       {Treleaven, Philip C.},
  title =        {{VLSI}: machine architecture and very high level
                 languages},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {8},
  number =       {7},
  pages =        {27--38},
  month =        oct,
  year =         {1980},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Dickman:1981:SB,
  author =       {Dickman, Lloyd},
  title =        {{SIGARCH} business},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {9},
  number =       {1},
  pages =        {7--8},
  month =        feb,
  year =         {1981},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:28 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{DePrycker:1981:NIM,
  author =       {{De Prycker}, Martin L.},
  title =        {A new index mode for the {VAX-11}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {9},
  number =       {2},
  pages =        {10--11},
  month =        apr,
  year =         {1981},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/1296940.1296941},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Tue Jun 17 11:58:05 MDT 2008},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract =     {One advantage of most high level languages over
                 machine languages consists of the availability of
                 concepts which are frequently used by most programmers.
                 One of these concepts is the array mechanism, where the
                 high level language generally provides three operations
                 associated with array manipulations: type-checking,
                 bounds-checking and address calculation.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

%%% The abstract summarizes the Phoenix Project, conducted at the
%%% Institute for Advanced Computation, NASA-Ames, from 1975 to 1979.
@Article{Stevenson:1981:PP,
  author =       "David Stevenson",
  title =        "The {Phoenix Project}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "9",
  number =       "2",
  pages =        "12--15",
  month =        apr,
  year =         "1981",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1296940.1296942",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:58:05 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The Phoenix Project was an exploration of the issues
                 surrounding large scale scientific computing. It was
                 conducted at the Institute for Advanced Computation,
                 NASA-Ames Research Center at Moffett Field, California
                 from 1975 to 1979. The primary results of the project
                 were a sizing of the likely needs of large scale
                 scientific computing during the 1980s, what computing
                 technology could be available to meet those needs, a
                 conceptual design of a processor that could meet those
                 needs, and a programming language suitable for use by
                 this community on a parallel processor such as the one
                 proposed.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{VanOost:1981:MPS,
  author =       {{Van Oost}, E. M. J. C.},
  title =        {Multi-processor system description and simulation
                 using structured multi-programming languages},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {9},
  number =       {2},
  pages =        {16--32},
  month =        apr,
  year =         {1981},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/1296940.1296943},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Tue Jun 17 11:58:05 MDT 2008},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract =     {Most of the multi-processor systems designed for real
                 time control demand a high efficiency, compromising the
                 simplicity of the system. If this requirement imposes a
                 hardware implementation of most of the primitives of
                 the system, a complicated hardware will result. In
                 order to retain to some extent the ease of using
                 structured multi-programming languages, e.g. Concurrent
                 Pascal [1], we have used these languages for the
                 description and simulation of the complex hardware,
                 instead of using them for software implementation of
                 parallelism.\par

                 This approach is explained with examples taken from an
                 existing multi-processor system [2] developed at the
                 Brussels Free University (V.U.B.).},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

%%% Book review: the reviewed work is quoted inside the title with TeX
%%% single quotes (`...'), so that the opening quote typesets correctly.
@Article{Wakerly:1981:BRR,
  author =       "John Wakerly",
  title =        "Book review: Review of {`The Computers that Saved
                 Metropolis, by DC Comics and Radio Shack', July 1980}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "9",
  number =       "2",
  pages =        "33--34",
  month =        apr,
  year =         "1981",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1296940.1296945",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:58:05 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% TO DO: [04-Sep-2014] Volume 9 number 3: no data yet in ACM Portal database

%%% NOTE(review): pages and month below were supplied from the table of
%%% contents of the 8th Annual Symposium on Computer Architecture
%%% proceedings (May 1981); confirm against the ACM Portal once data
%%% for volume 9 number 3 become available.
@Article{Arvind:1981:MPD,
  author =       "Arvind and V. Kathail",
  title =        "A Multiple Processor Data Flow Machine that Supports
                 Generalized Procedures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "9",
  number =       "3",
  pages =        "291--302",
  month =        may,
  year =         "1981",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibsource =    "ftp://ftp.math.utah.edu/pub/mirrors/ftp.ira.uka.de/bibliography/Compiler/Functional.bib;
                 http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "Proceedings of the 8th Annual Symposium on Computer
                 Architecture.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "functional dataflow",
}

@Article{Gerrity:1981:PI,
  author =       {Gerrity, G. W.},
  title =        {On processes and interrupts},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {9},
  number =       {4},
  pages =        {4--14},
  month =        jun,
  year =         {1981},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:07 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Hill:1981:HMS,
  author =       {Hill, Dwight D.},
  title =        {A hardware mechanism for supporting range checks},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {9},
  number =       {4},
  pages =        {15--21},
  month =        jun,
  year =         {1981},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:07 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Cherniavsky:1981:CMA,
  author =       {Cherniavsky, Vladimir S.},
  title =        {The computing memory another distributed computer
                 architecture},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {9},
  number =       {4},
  pages =        {22--24},
  month =        jun,
  year =         {1981},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:07 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Thornton:1981:ASC,
  author =       {Thornton, James E.},
  title =        {{8th Annual Symposium on Computer Architecture:
                 Heterogeneous Computer Architecture}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {9},
  number =       {4},
  pages =        {25--33},
  month =        jun,
  year =         {1981},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:07 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Staff:1981:ETP,
  author =       {Computer Architecture News Staff},
  title =        {Errata for two publications},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {9},
  number =       {4},
  pages =        {34--34},
  month =        jun,
  year =         {1981},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:07 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Lindsay:1981:CMM,
  author =       {Lindsay, Donald C.},
  title =        {Cache memory for microprocessors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {9},
  number =       {5},
  pages =        {6--13},
  month =        aug,
  year =         {1981},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/1296947.1296948},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Tue Jun 17 12:06:16 MDT 2008},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract =     {A growth path for current microprocessors is suggested
                 which includes bus enhancements and cache memories. The
                 implications are examined, and several differences from
                 the mainframe world are pointed out.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Kavi:1981:IAC,
  author =       {Kavi, Krishna M.},
  title =        {Innovative architectures and commercial computers: a
                 summary of the panel discussion at {NCC 1981}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {9},
  number =       {5},
  pages =        {14--16},
  month =        aug,
  year =         {1981},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/1296947.1296949},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Tue Jun 17 12:06:16 MDT 2008},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract =     {The session was held on May 4, 1981 in Chicago at NCC
                 1981. The panelists were Harvey Cragon, Pat Goldberg,
                 Dave Patterson, Justin Rattner, Dean Earnest and Peter
                 Denning. Krishna Kavi was the moderator. A complete
                 report of the session is available and can be obtained
                 by writing to the Computer Science Department, P. O.
                 Box 44330, U.S.L., Lafayette, LA 70504.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

%%% NOTE(review): the second author's initial was supplied from the
%%% author list of the ISCA '81 paper that this erratum corrects;
%%% confirm against the printed issue.
@Article{Jenevein:1981:EHS,
  author =       "R. M. Jenevein and D. DeGroot and G. Jack Lipovski",
  title =        "Errata: ``{A} hardware support mechanism for
                 scheduling resources in parallel machine environment'':
                 (from {Proceedings of the 8th Annual Symposium on
                 Computer Architecture}, p. 57)",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "9",
  number =       "5",
  pages =        "17--17",
  month =        aug,
  year =         "1981",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1296947.1296950",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 12:06:16 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Yuen:1981:EPS,
  author =       {Yuen, C. K.},
  title =        {Extending the power of short-wordlength processors by
                 means of context-dependent machine instructions},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {9},
  number =       {6},
  pages =        {9--15},
  month =        oct,
  year =         {1981},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:29 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Gottlieb:1981:CPP,
  author =       {Gottlieb, Allan and Kruskal, Clyde P.},
  title =        {Coordinating parallel processors: a partial
                 unification},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {9},
  number =       {6},
  pages =        {16--24},
  month =        oct,
  year =         {1981},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:29 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Anonymous:1981:ESM,
  author =       {Anonymous},
  title =        {Errata: Structured machine design: an ongoing
                 experiment},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {9},
  number =       {6},
  pages =        {25--25},
  month =        oct,
  year =         {1981},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:29 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{McDowell:1982:PML,
  author =       {McDowell, Charlie},
  title =        {Protection at the micromachine level},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {10},
  number =       {1},
  pages =        {4--8},
  month =        jan,
  year =         {1982},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:29 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Feustel:1982:PPC,
  author =       {Feustel, Edward A.},
  title =        {Protected procedure call on the {PRIME(TM)} machines},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {10},
  number =       {1},
  pages =        {9--22},
  month =        jan,
  year =         {1982},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:29 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{El-Halabi:1982:SRD,
  author =       {El-Halabi, Hossam and Agrawal, Dharma P.},
  title =        {Some remarks on direct execution computers},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {10},
  number =       {1},
  pages =        {23--27},
  month =        jan,
  year =         {1982},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:29 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Fitzpatrick:1982:RAV,
  author =       {Fitzpatrick, Daniel T. and Foderaro, John K. and
                 Katevenis, Manolis G. H. and Landman, Howard A. and
                 Patterson, David A. and Peek, James B. and
                 Peshkess, Zvi and S{\'e}quin, Carlo H. and
                 Sherburne, Robert W. and {Van Dyke}, Korbin S.},
  title =        {A {RISCy} approach to {VLSI}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {10},
  number =       {1},
  pages =        {28--32},
  month =        jan,
  year =         {1982},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:29 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Rattner:1982:HSC,
  author =       {Rattner, Justin},
  title =        {Hardware\slash software cooperation in the
                 {iAPX-432}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {10},
  number =       {2},
  pages =        {1--1},
  month =        mar,
  year =         {1982},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:44 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Hennessy:1982:HST,
  author =       {Hennessy, John and Jouppi, Norman and Baskett, Forest
                 and Gross, Thomas and Gill, John},
  title =        {Hardware\slash software tradeoffs for increased
                 performance},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {10},
  number =       {2},
  pages =        {2--11},
  month =        mar,
  year =         {1982},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:44 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Rymarczyk:1982:CGP,
  author =       {Rymarczyk, James W.},
  title =        {Coding guidelines for pipelined processors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {10},
  number =       {2},
  pages =        {12--19},
  month =        mar,
  year =         {1982},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:44 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

%%% {Mesa} is a proper name, so it is capitalized and brace-protected
%%% against downcasing by bibliography styles.
@Article{Johnsson:1982:OMP,
  author =       "Richard K. Johnsson and John D. Wick",
  title =        "An overview of the {Mesa} processor architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "2",
  pages =        "20--29",
  month =        mar,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:44 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% NOTE(review): the trademark sign in the product name is written as
%%% (TM), as in the {PRIME(TM)} title of Feustel:1982:PPC; confirm its
%%% exact position against the printed title.
@Article{Berenbaum:1982:OSL,
  author =       "Alan D. Berenbaum and Michael W. Condry and Priscilla
                 M. Lu",
  title =        "The operating system and language support features of
                 the {BELLMAC(TM)-32} microprocessor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "2",
  pages =        "30--38",
  month =        mar,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:44 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Radin:1982:M,
  author =       {Radin, George},
  title =        {The 801 minicomputer},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {10},
  number =       {2},
  pages =        {39--47},
  month =        mar,
  year =         {1982},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:44 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Ditzel:1982:RAF,
  author =       {Ditzel, David R. and McLellan, H. R.},
  title =        {Register allocation for free: {The C} machine stack
                 cache},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {10},
  number =       {2},
  pages =        {48--56},
  month =        mar,
  year =         {1982},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:44 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Harbison:1982:AAO,
  author =       {Harbison, Samuel P.},
  title =        {An architectural alternative to optimizing compilers},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {10},
  number =       {2},
  pages =        {57--65},
  month =        mar,
  year =         {1982},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:44 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Lampson:1982:FPC,
  author =       {Lampson, Butler W.},
  title =        {Fast procedure calls},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {10},
  number =       {2},
  pages =        {66--76},
  month =        mar,
  year =         {1982},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:44 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Jones:1982:SPM,
  author          = {Douglas W. Jones},
  title           = {Systematic protection mechanism design},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {2},
  pages           = {77--80},
  month           = mar,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:44 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Reed:1982:GPM,
  author          = {Karl Reed},
  title           = {On a general property of memory mapping tables},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {2},
  pages           = {81--86},
  month           = mar,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:44 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Cook:1982:EIO,
  author          = {Robert P. Cook and Nitin Donde},
  title           = {An experiment to improve operand addressing},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {2},
  pages           = {87--91},
  month           = mar,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:44 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Fusaoka:1982:CCH,
  author          = {Akira Fusaoka and Masaharu Hirayama},
  title           = {Compiler chip: a hardware implementation of compiler},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {2},
  pages           = {92--95},
  month           = mar,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:44 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Rau:1982:ASE,
  author          = {B. R. Rau and C. D. Glaeser and E. M. Greenawalt},
  title           = {Architectural support for the efficient generation
                     of code for horizontal architectures},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {2},
  pages           = {96--99},
  month           = mar,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:44 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{McLear:1982:GCD,
  author          = {R. E. McLear and D. M. Scheibelhut and E. Tammaru},
  title           = {Guidelines for creating a debuggable processor},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {2},
  pages           = {100--106},
  month           = mar,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:44 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Wilkes:1982:HSM,
  author          = {M. V. Wilkes},
  title           = {Hardware support for memory protection:
                     {Capability} implementations},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {2},
  pages           = {107--116},
  month           = mar,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:44 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Pollack:1982:SAM,
  author          = {Fred J. Pollack and George W. Cox and
                     Dan W. Hammerstrom and Kevin C. Kahn and
                     Konrad K. Lai and Justin R. Rattner},
  title           = {Supporting {Ada} memory management in the {iAPX-432}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {2},
  pages           = {117--131},
  month           = mar,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:44 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Sansonnet:1982:DEL,
  author          = {J. P. Sansonnet and M. Castan and C. Percebois and
                     D. Botella and J. Perez},
  title           = {Direct execution of {Lisp} on a list-directed
                     architecture},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {2},
  pages           = {132--139},
  month           = mar,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:44 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Johnson:1982:SRA,
  author          = {Mark Scott Johnson},
  title           = {Some requirements for architectural support of
                     software debugging},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {2},
  pages           = {140--148},
  month           = mar,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:44 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Middelburg:1982:EPA,
  author =       "C. A. Middelburg",
  title =        "The effect of the {PDP-11} architecture on code
                 generation for {CHILL}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "2",
  pages =        "149--157",
  month =        mar,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:44 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sweet:1982:EAM,
  author =       "Richard E. Sweet and James G. {Sandman, Jr.}",
  title =        "Empirical analysis of the {Mesa} instruction set",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "2",
  pages =        "158--166",
  month =        mar,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:44 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{McDaniel:1982:AMI,
  author =       "Gene McDaniel",
  title =        "An analysis of a {Mesa} instruction set using dynamic
                 instruction frequencies",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "2",
  pages =        "167--176",
  month =        mar,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:44 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@article{Wiecek:1982:CSV,
  author          = {Cheryl A. Wiecek},
  title           = {A case study of {VAX-11} instruction set usage
                     for compiler execution},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {2},
  pages           = {177--184},
  month           = mar,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:44 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Maekawa:1982:FSA,
  author          = {Mamoru Maekawa and Ken Sakamura and Chiaki Ishikawa},
  title           = {Firmware structure and architectural support for
                     monitors, vertical migration and user microprogramming},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {2},
  pages           = {185--194},
  month           = mar,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:44 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Kamibayashi:1982:HOS,
  author          = {N. Kamibayashi and H. Ogawana and K. Nagayama and
                     H. Aiso},
  title           = {{Heart}: an operating system nucleus machine
                     implemented by firmware},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {2},
  pages           = {195--204},
  month           = mar,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:44 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Ahuja:1982:MMA,
  author          = {Sudhir R. Ahuja and Abhaya Asthana},
  title           = {A multi-microprocessor architecture with hardware
                     support for communication and scheduling},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {2},
  pages           = {205--209},
  month           = mar,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:44 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Patterson:1982:RAH,
  author          = {David A. Patterson and Richard S. Piepho},
  title           = {{RISC} assessment: a high-level language experiment},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {3},
  pages           = {3--8},
  month           = apr,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Clark:1982:MAI,
  author          = {Douglas W. Clark and Henry M. Levy},
  title           = {Measurement and analysis of instruction use in
                     the {VAX-11\slash 780}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {3},
  pages           = {9--17},
  month           = apr,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Kavi:1982:HAP,
  author          = {Krishna Kavi and Boumediene Belkhouche and
                     Evelyn Bullard and Lois Delcambre and Stephen Nemecek},
  title           = {{HLL} architectures: {Pitfalls} and predilections},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {3},
  pages           = {18--23},
  month           = apr,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Gottlieb:1982:NUD,
  author          = {Allan Gottlieb and Ralph Grishman and
                     Clyde P. Kruskal and Kevin P. McAuliffe and
                     Larry Rudolph and Marc Snir},
  title           = {The {NYU Ultracomputer}---designing a {MIMD},
                     shared-memory parallel machine (extended abstract)},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {3},
  pages           = {27--42},
  month           = apr,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Chu:1982:VAH,
  author          = {King-Hang Chu and King-Sun Fu},
  title           = {{VLSI} architectures for high speed recognition of
                     context-free languages and finite-state languages},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {3},
  pages           = {43--49},
  month           = apr,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Franklin:1982:ACC,
  author          = {Mark A. Franklin and Donald F. Wann},
  title           = {Asynchronous and clocked control structures for
                     {VLSI} based interconnection networks},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {3},
  pages           = {50--59},
  month           = apr,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{McMillen:1982:PFT,
  author          = {Robert J. McMillen and Howard Jay Siegel},
  title           = {Performance and fault tolerance improvements in
                     the {Inverse Augmented Data Manipulator} network},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {3},
  pages           = {63--72},
  month           = apr,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Parker:1982:GNM,
  author          = {D. S. Parker and C. S. Raghavendra},
  title           = {The {Gamma} network: a multiprocessor
                     interconnection network with redundant paths},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {3},
  pages           = {73--80},
  month           = apr,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Jenevein:1982:CPR,
  author          = {R. M. Jenevein and J. C. Browne},
  title           = {A control processor for a reconfigurable array computer},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {3},
  pages           = {81--89},
  month           = apr,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Bhuyan:1982:GCP,
  author          = {Laxmi N. Bhuyan and Dharma P. Agrawal},
  title           = {A general class of processor interconnection strategies},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {3},
  pages           = {90--98},
  month           = apr,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Burkowski:1982:ISD,
  author          = {F. J. Burkowski},
  title           = {Instruction set design issues relating to a
                     static dataflow computer},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {3},
  pages           = {101--111},
  month           = apr,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Smith:1982:DAE,
  author          = {James E. Smith},
  title           = {Decoupled access\slash execute computer architectures},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {3},
  pages           = {112--119},
  month           = apr,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Caluwaerts:1982:DFA,
  author          = {L. J. Caluwaerts and J. Debacker and
                     J. A. Peperstraete},
  title           = {A data flow architecture with a paged memory system},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {3},
  pages           = {120--127},
  month           = apr,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Rau:1982:ECG,
  author          = {B. Ramakrishna Rau and Christopher D. Glaeser and
                     Raymond L. Picard},
  title           = {Efficient code generation for horizontal
                     architectures: {Compiler} techniques and
                     architectural support},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {3},
  pages           = {131--139},
  month           = apr,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Barton:1982:SNH,
  author          = {Gene C. Barton},
  title           = {{Sentry}: a novel hardware implementation of
                     classic operating system mechanisms},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {3},
  pages           = {140--147},
  month           = apr,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Abramovici:1982:LSM,
  author          = {M. Abramovici and Y. H. Levendel and P. R. Menon},
  title           = {A logic simulation machine},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {3},
  pages           = {148--157},
  month           = apr,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Dasgupta:1982:TFL,
  author          = {Subrata Dasgupta and Marius Olafsson},
  title           = {Towards a family of languages for the design and
                     implementation of machine architectures},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {3},
  pages           = {158--167},
  month           = apr,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Lee:1982:RPD,
  author          = {Yann-Hang Lee and Kang G. Shin},
  title           = {Rollback propagation detection and performance
                     evaluation of {FTMR2M}---a fault-tolerant
                     multiprocessor},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {3},
  pages           = {171--180},
  month           = apr,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Lin:1982:DFT,
  author =       "Woei Lin and Chuan-lin Wu",
  title =        "Design of a {$ 2 \times 2 $} fault-tolerant switching
                 element",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "3",
  pages =        "181--189",
  month =        apr,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:52 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@article{Fussell:1982:FTW,
  author          = {Donald Fussell and Peter Varman},
  title           = {Fault-tolerant wafer-scale architectures for {VLSI}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {3},
  pages           = {190--198},
  month           = apr,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Pramanik:1982:DF,
  author          = {Sakti Pramanik},
  title           = {Database filters},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {3},
  pages           = {201--210},
  month           = apr,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Tokoro:1982:SSI,
  author          = {Mario Tokoro and Takashi Takizuka},
  title           = {On the semantic structure of information --- a
                     proposal of the abstract storage architecture},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {3},
  pages           = {211--217},
  month           = apr,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Dohi:1982:HSA,
  author          = {Yasunori Dohi and Akira Suzuki and Noriyuki Matsui},
  title           = {Hardware sorter and its application to data base
                     machine},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {3},
  pages           = {218--225},
  month           = apr,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Treleaven:1982:RCA,
  author          = {Philip C. Treleaven and Richard P. Hopkins},
  title           = {A recursive computer architecture for {VLSI}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {3},
  pages           = {229--238},
  month           = apr,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Castan:1982:HRP,
  author          = {M. Castan and E. I. Organick},
  title           = {{$ \mu $3L}: an {HLL-RISC} processor for parallel
                     execution of {FP}-language programs},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {3},
  pages           = {239--247},
  month           = apr,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Hommes:1982:HSC,
  author          = {F. Hommes},
  title           = {The heap\slash substitution concept --- an
                     implementation of functional operations on data
                     structures for a reduction machine},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {3},
  pages           = {248--256},
  month           = apr,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Reynolds:1982:SRA,
  author          = {Paul F. {Reynolds, Jr.}},
  title           = {A shared resource algorithm for distributed simulation},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {3},
  pages           = {259--266},
  month           = apr,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Jain:1982:DPT,
  author          = {Bijendra N. Jain},
  title           = {Duplication of packets and their detection in
                     {X.25} communication protocols},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {3},
  pages           = {267--273},
  month           = apr,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Markenscoff:1982:MPS,
  author          = {Pauline Markenscoff},
  title           = {A multiple processor system for real time control tasks},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {3},
  pages           = {274--280},
  month           = apr,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Miller:1982:HMD,
  author          = {Leslie Jill Miller},
  title           = {A heterogeneous multiprocessor design and the distributed
                     scheduling of its task group workload},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {3},
  pages           = {283--290},
  month           = apr,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Goble:1982:DPV,
  author          = {George H. Goble and Michael H. Marsh},
  title           = {A dual processor {VAX 11\slash 780}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {3},
  pages           = {291--298},
  month           = apr,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Dubois:1982:ECC,
  author =       "Michel Dubois and Fay{\'e} A. Briggs",
  title =        "Effects of cache coherency in multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "3",
  pages =        "299--308",
  month =        apr,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:52 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Mudge:1982:PAC,
  author          = {T. N. Mudge and B. A. Makrucki},
  title           = {Probabilistic analysis of a crossbar switch},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {3},
  pages           = {311--320},
  month           = apr,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Levitan:1982:FEN,
  author          = {Steven P. Levitan and Caxton C. Foster},
  title           = {Finding an extremum in a network},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {3},
  pages           = {321--325},
  month           = apr,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Premkumar:1982:RAR,
  author          = {U. V. Premkumar and J. C. Browne},
  title           = {Resource allocation in rectangular {SW} banyans},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {3},
  pages           = {326--333},
  month           = apr,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Anonymous:1982:LA,
  author          = {Anonymous},
  title           = {List of authors},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {3},
  pages           = {335--335},
  month           = apr,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:52 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Mayer:1982:ABB,
  author          = {Alastair J. W. Mayer},
  title           = {The architecture of the {Burroughs B5000}:
                     20 years later and still ahead of the times?},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {4},
  pages           = {3--10},
  month           = jun,
  year            = {1982},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/641542.641543},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:07 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Brakefield:1982:OSA,
  author          = {James C. Brakefield},
  title           = {From the other side of the {Atlantic}:
                     how to improve upon the {MU5} design},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {4},
  pages           = {11--16},
  month           = jun,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:07 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Hansen:1982:PEI,
  author          = {Paul M. Hansen and Mark A. Linton and
                     Robert N. Mayo and Marguerite Murphy and
                     David A. Patterson},
  title           = {A performance evaluation of the {Intel iAPX 432}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {4},
  pages           = {17--26},
  month           = jun,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:07 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Huguet:1982:PPS,
  author          = {Miquel Huguet},
  title           = {The protection of the processor status word
                     of the {PDP-11\slash 60}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {4},
  pages           = {27--30},
  month           = jun,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:07 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Brakefield:1982:JWO,
  author =       "James C. Brakefield",
  title =        "Just what is an op-code?: or a universal computer
                 design",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "10",
  number =       "4",
  pages =        "31--34",
  month =        jun,
  year =         "1982",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:07 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Knott:1982:FDA,
  author          = {J. D. Knott and T. W. Crockett},
  title           = {Fair dynamic arbitration for a multiprocessor communications bus},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {5},
  pages           = {4--9},
  month           = sep,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:25 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Larus:1982:CMA,
  author          = {James R. Larus},
  title           = {A comparison of microcode, assembly code,
                     and high-level languages on the {VAX-11} and {RISC I}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {5},
  pages           = {10--15},
  month           = sep,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:25 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Patterson:1982:PEI,
  author          = {David A. Patterson},
  title           = {A performance evaluation of the {Intel 80286}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {5},
  pages           = {16--18},
  month           = sep,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:25 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Egan:1982:EVC,
  author          = {Rod Egan},
  title           = {The effect of {VLSI} on computer architecture},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {5},
  pages           = {19--22},
  month           = sep,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:25 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Benzie:1982:BRR,
  author          = {Thomas Benzie},
  title           = {Book reviews: Review of
                     {{\em Microcomputer Architecture and Programming\/}}
                     by {John F. Wakerly, John Wiley \& Sons, Inc., 1981}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {5},
  pages           = {23--23},
  month           = sep,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:25 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Levy:1982:UBM,
  author          = {Henry M. Levy and Douglas W. Clark},
  title           = {On the use of benchmarks for measuring system performance},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {6},
  pages           = {5--8},
  month           = dec,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:26 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Schulthess:1982:ONA,
  author          = {Peter Schulthess and Fritz Vonaesch},
  title           = {{OPA}: a new architecture for {Pascal-like} languages},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {6},
  pages           = {9--20},
  month           = dec,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:26 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Brakefield:1982:TI,
  author          = {James C. Brakefield},
  title           = {Talk on interpreters},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {6},
  pages           = {21--28},
  month           = dec,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:26 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Doran:1982:MFC,
  author          = {D. W. Doran},
  title           = {Main frame computer trends},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {10},
  number          = {6},
  pages           = {29--44},
  month           = dec,
  year            = {1982},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:26 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Gajski:1983:CLS,
  author          = {Daniel Gajski and David Kuck and
                     Duncan Lawrie and Ahmed Sameh},
  title           = {{CEDAR}: a large scale multiprocessor},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {11},
  number          = {1},
  pages           = {7--11},
  month           = mar,
  year            = {1983},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{French:1983:TDF,
  author          = {Elaine French and Hugh Glaser},
  title           = {{TUKI}: a data flow processor},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {11},
  number          = {1},
  pages           = {12--18},
  month           = mar,
  year            = {1983},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Marovac:1983:SAD,
  author          = {Nenad Marovac},
  title           = {A systematic approach to the design and
                     implementation of a computer instruction set},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {11},
  number          = {1},
  pages           = {19--24},
  month           = mar,
  year            = {1983},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Cragon:1983:EIS,
  author          = {Harvey Cragon},
  title           = {Executable instruction set specification},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {11},
  number          = {1},
  pages           = {25--43},
  month           = mar,
  year            = {1983},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Colwell:1983:PTR,
  author          = {Robert P. Colwell and Charles Y. Hitchcock and
                     E. Douglas Jensen},
  title           = {Peering through the {RISC\slash CISC} fog:
                     an outline of research},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {11},
  number          = {1},
  pages           = {44--50},
  month           = mar,
  year            = {1983},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Gorsline:1983:RAC,
  author          = {G. W. Gorsline},
  title           = {Review of {{\em Advances in Computer Architecture\/}}
                     by {Glenford J. Myers, John Wiley \& Sons, Inc. 1982}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {11},
  number          = {1},
  pages           = {55--55},
  month           = mar,
  year            = {1983},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Sachs:1983:BRR,
  author          = {M. W. Sachs},
  title           = {Book reviews: Review of
                     {{\em Microcomputer Interfacing\/}}
                     by {G. Jack Lipovski, Lexington Books 1980}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {11},
  number          = {1},
  pages           = {55--55},
  month           = mar,
  year            = {1983},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Abramson:1983:HSP,
  author          = {David Abramson and John Rosenberg},
  title           = {Hardware support for program debuggers in a paged virtual memory},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {11},
  number          = {2},
  pages           = {8--19},
  month           = jun,
  year            = {1983},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:42 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Frailey:1983:WLC,
  author =       "Dennis J. Frailey",
  title =        "Word length of a computer architecture: definitions
                 and applications",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "2",
  pages =        "20--26",
  month =        jun,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:42 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hollaar:1983:BRR,
  author          = {Lee A. Hollaar},
  title           = {Book reviews: Review of {{\em Computer Design\/}}
                     by {Glen G. Langdon, Computeach Press}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {11},
  number          = {2},
  pages           = {27--28},
  month           = jun,
  year            = {1983},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:42 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Wilkes:1983:SPS,
  author          = {Maurice V. Wilkes},
  title           = {Size, power, and speed (keynote address)},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {11},
  number          = {3},
  pages           = {2--4},
  month           = jun,
  year            = {1983},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:53 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Giloi:1983:TTC,
  author          = {W. K. Giloi},
  title           = {Towards a taxonomy of computer architecture
                     based on the machine data type view},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {11},
  number          = {3},
  pages           = {6--15},
  month           = jun,
  year            = {1983},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:53 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Avizienis:1983:FTF,
  author          = {Algirdas Avi{\v{z}}ienis},
  title           = {Framework for a taxonomy of fault-tolerance
                     attributes in computer systems},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {11},
  number          = {3},
  pages           = {16--21},
  month           = jun,
  year            = {1983},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:53 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Pehrson:1983:CID,
  author =       "Bj{\"o}rn Pehrson and Joachim Parrow",
  title =        "Caddie---an interactive design environment",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "3",
  pages =        "24--31",
  month =        jun,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:53 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Dasgupta:1983:VCA,
  author          = {Subrata Dasgupta},
  title           = {On the verification of computer architectures
                     using an architecture description language},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {11},
  number          = {3},
  pages           = {32--38},
  month           = jun,
  year            = {1983},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:53 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{King:1983:RSC,
  author          = {Richard M. King},
  title           = {Research on synthesis of concurrent computing
                     systems (extended abstract)},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {11},
  number          = {3},
  pages           = {39--46},
  month           = jun,
  year            = {1983},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:53 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Fisher:1983:APP,
  author          = {Allan L. Fisher and H. T. Kung and
                     Louis M. Monier and Yasunori Dohi},
  title           = {Architecture of the {PSC}---a programmable systolic chip},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {11},
  number          = {3},
  pages           = {48--53},
  month           = jun,
  year            = {1983},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:53 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Fisher:1983:SLV,
  author          = {Allan L. Fisher and H. T. Kung},
  title           = {Synchronizing large {VLSI} processor arrays},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {11},
  number          = {3},
  pages           = {54--58},
  month           = jun,
  year            = {1983},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:53 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Wagner:1983:BVM,
  author          = {Robert A. Wagner},
  title           = {The {Boolean Vector Machine [BVM]}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {11},
  number          = {3},
  pages           = {59--66},
  month           = jun,
  year            = {1983},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:53 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Bonuccelli:1983:VTM,
  author          = {M. A. Bonuccelli and E. Lodi and F. Luccio and
                     P. Maestrini and L. Pagli},
  title           = {A {VLSI} tree machine for relational data bases},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {11},
  number          = {3},
  pages           = {67--73},
  month           = jun,
  year            = {1983},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:53 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Caluwaerts:1983:ISD,
  author          = {L. J. Caluwaerts and J. Debacker and
                     J. A. Peperstraete},
  title           = {Implementing streams on a data flow computer
                     system with paged memory},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {11},
  number          = {3},
  pages           = {76--83},
  month           = jun,
  year            = {1983},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:53 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Requa:1983:PDF,
  author =       "Joseph E. Requa",
  title =        "The {Piecewise Data Flow} architecture: control flow
                 and register management",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "3",
  pages =        "84--89",
  month =        jun,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:53 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tokoro:1983:WSC,
  author          = {Mario Tokoro and J. R. Jagannathan and
                     Hideki Sunahara},
  title           = {On the working set concept for data-flow machines},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {11},
  number          = {3},
  pages           = {90--97},
  month           = jun,
  year            = {1983},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:53 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Marczynski:1983:DDS,
  author          = {R. W. Marczy{\'n}ski and J. Milewski},
  title           = {A data driven system based on a microprogrammed processor module},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {11},
  number          = {3},
  pages           = {98--106},
  month           = jun,
  year            = {1983},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:53 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Patterson:1983:AVI,
  author =       {David A. Patterson and Phil Garrison and Mark Hill and
                 Dimitris Lioupis and Chris Nyberg and Tim Sippel and Korbin
                 {Van Dyke}},
  title =        {Architecture of a {VLSI} instruction cache for a {RISC}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {3},
  pages =        {108--116},
  month =        jun,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:53 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Yeh:1983:PSC,
  author =       {Phil C. C. Yeh and Janak H. Patel and Edward S. Davidson},
  title =        {Performance of shared cache for parallel-pipelined computer
                 systems},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {3},
  pages =        {117--123},
  month =        jun,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:53 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Goodman:1983:UCM,
  author =       {James R. Goodman},
  title =        {Using cache memory to reduce processor-memory traffic},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {3},
  pages =        {124--131},
  month =        jun,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:53 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Smith:1983:SIC,
  author =       {James E. Smith and James R. Goodman},
  title =        {A study of instruction cache organizations and replacement
                 policies},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {3},
  pages =        {132--137},
  month =        jun,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:53 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Fisher:1983:VLI,
  author =       {Joseph A. Fisher},
  title =        "{Very Long Instruction Word} architectures and the {ELI-512}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {3},
  pages =        {140--150},
  month =        jun,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:53 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Tomita:1983:UML,
  author =       {Shinji Tomita and Kiyoshi Shibayama and Toshiaki Kitamura
                 and Toshiyuki Nakata and Hiroshi Hagiwara},
  title =        {A user-microprogrammable, local host computer with low-level
                 parallelism},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {3},
  pages =        {151--157},
  month =        jun,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:53 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Gumpertz:1983:CTE,
  author =       {Richard H. Gumpertz},
  title =        {Combining tags with error codes},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {3},
  pages =        {160--165},
  month =        jun,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:53 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Park:1983:FDB,
  author =       {Young Gil Park and Jung Wan Cho},
  title =        {Fault diagnosis of bit-slice processor},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {3},
  pages =        {166--172},
  month =        jun,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:53 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Fiol:1983:LDI,
  author =       "M. A. Fiol and I. Alegre and J. L. A. Yebra",
  title =        "Line digraph iterations and the {$(d, k)$} problem for
                 directed graphs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "3",
  pages =        "174--177",
  month =        jun,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:53 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Opper:1983:RAR,
  author =       {Eli Opper and Miroslaw Malek and G. Jack Lipovski},
  title =        {Resource allocation in rectangular {CC}-banyans},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {3},
  pages =        {178--184},
  month =        jun,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:53 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Sovis:1983:UTS,
  author =       {Franti{\v{s}}ek Sovi{\v{s}}},
  title =        {Uniform theory of the shuffle-exchange type permutation
                 networks},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {3},
  pages =        {185--191},
  month =        jun,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:53 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Srini:1983:ACA,
  author =       {Vason P. Srini and Jorge F. Asenjo},
  title =        {Analysis of {Cray-1S} architecture},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {3},
  pages =        {194--206},
  month =        jun,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:53 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Jordan:1983:PMH,
  author =       {Harry F. Jordan},
  title =        {Performance measurements on {HEP} --- a pipelined {MIMD}
                 computer},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {3},
  pages =        {207--212},
  month =        jun,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:53 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Amano:1983:SSM,
  author =       "Hideharu Amano and Takaichi Yoshida and Hideo Aiso",
  title =        "{(SM)$^2$} --- {Sparse Matrix Solving Machine}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "3",
  pages =        "213--220",
  month =        jun,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:53 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Krishnan:1983:ESC,
  author =       {R. Kalyana Krishnan and A. K. Rajasekar and C. S. Moghe},
  title =        {An experimental system for {Computer Science} instruction},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {3},
  pages =        {222--227},
  month =        jun,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:53 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Kronlof:1983:ECM,
  author =       {Klaus Kronl{\"o}f},
  title =        {Execution control and memory management of a
                 {Data Flow Signal Processor}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {3},
  pages =        {230--235},
  month =        jun,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:53 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Kishi:1983:DDD,
  author =       {Masasuke Kishi and Hiroshi Yasuhara and Yasusuke Kawamura},
  title =        "{DDDP}---a {Distributed Data Driven Processor}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {3},
  pages =        {236--242},
  month =        jun,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:53 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Takahashi:1983:DFP,
  author =       {Naohisa Takahashi and Makoto Amamiya},
  title =        {A data flow processor array system: {Design} and analysis},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {3},
  pages =        {243--250},
  month =        jun,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:53 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Pier:1983:RDH,
  author =       {Kenneth A. Pier},
  title =        {A retrospective on the {Dorado}, a high-performance personal
                 computer},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {3},
  pages =        {252--269},
  month =        jun,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:53 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Dugan:1983:SEA,
  author =       "Robert J. Dugan",
  title =        "{System\slash 370} extended architecture: {A} program
                 view of the channel subsystem",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "3",
  pages =        "270--276",
  month =        jun,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:53 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Norton:1983:AIM,
  author =       {Richard L. Norton and Jacob A. Abraham},
  title =        {Adaptive interpretation as a means of exploiting complex
                 instruction sets},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {3},
  pages =        {277--282},
  month =        jun,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:53 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Kumar:1983:SSC,
  author =       {Manoj Kumar and Daniel M. Dias and J. R. Jump},
  title =        {Switching strategies in a class of packet
                 switching networks},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {3},
  pages =        {284--300},
  month =        jun,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:53 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Wah:1983:CSD,
  author =       {Benjamin W. Wah},
  title =        {A comparative study of distributed resource sharing
                 on multiprocessors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {3},
  pages =        {301--308},
  month =        jun,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:53 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Fuchs:1983:CED,
  author =       {W. Kent Fuchs and Jacob A. Abraham and Kuang-Hua Huang},
  title =        {Concurrent error detection in {VLSI}
                 interconnection networks},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {3},
  pages =        {309--315},
  month =        jun,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:53 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Giloi:1983:HFD,
  author =       {W. K. Giloi and P. Behr},
  title =        {Hierarchical function distribution --- a design principle
                 for advanced multicomputer architectures},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {3},
  pages =        {318--325},
  month =        jun,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:53 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Stringa:1983:EIE,
  author =       "Luigi Stringa",
  title =        "{EMMA} --- an industrial experience on large
                 multiprocessing architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "3",
  pages =        "326--333",
  month =        jun,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:53 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Philipson:1983:CSM,
  author =       "Lars Philipson and Bo Nilsson and Bj{\"o}rn Breidegard",
  title =        "A communication structure for a multiprocessor
                 computer with distributed global memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "3",
  pages =        "334--340",
  month =        jun,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:53 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hayashi:1983:AHP,
  author =       {Hiromu Hayashi and Akira Hattori and Haruo Akimoto},
  title =        "{ALPHA}---a high-performance {LISP} machine equipped with a
                 new stack structure and garbage collection system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {3},
  pages =        {342--348},
  month =        jun,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:53 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Umeyama:1983:PEM,
  author =       {Shinji Umeyama and Koichiro Tamura},
  title =        {A parallel execution model of logic programs},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {3},
  pages =        {349--355},
  month =        jun,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:53 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Schmittgen:1983:SAC,
  author =       {Claudia Schmittgen and Werner Kluge},
  title =        {A system architecture for the concurrent evaluation
                 of applicative program expressions},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {3},
  pages =        {356--362},
  month =        jun,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:53 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Yamaguchi:1983:PEL,
  author =       {Yoshinori Yamaguchi and Kenji Toda and Toshitsugu Yuba},
  title =        {A performance evaluation of a {Lisp}-based data-driven
                 machine {(EM-3)}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {3},
  pages =        {363--369},
  month =        jun,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:53 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Tanimoto:1983:PAP,
  author =       {Steven L. Tanimoto},
  title =        {A pyramidal approach to parallel processing},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {3},
  pages =        {372--378},
  month =        jun,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:53 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Gaillat:1983:DPP,
  author =       "G{\'e}rard Gaillat",
  title =        "The design of a parallel processor for image
                 processing on-board satellites: {An} application oriented
                 approach",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "3",
  pages =        "379--386",
  month =        jun,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:53 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Nishimura:1983:LPP,
  author =       {Hitoshi Nishimura and Hiroshi Ohno and Toru Kawata and Isao
                 Shirakawa and Koichi Omura},
  title =        "{Links-1} --- a parallel pipelined multimicrocomputer system
                 for image creation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {3},
  pages =        {387--394},
  month =        jun,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:53 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Ericsson:1983:LSM,
  author =       {T. Ericsson and P. E. Danielsson},
  title =        "{LIPP} --- a {SIMD} multiprocessor architecture for image
                 processing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {3},
  pages =        {395--400},
  month =        jun,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:53 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Treleaven:1983:NGC,
  author =       {Philip C. Treleaven},
  title =        {The new generation of computer architecture},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {3},
  pages =        {402--409},
  month =        jun,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:53 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Uchida:1983:IMS,
  author =       {Shunichi Uchida},
  title =        {Inference machine: {From} sequential to parallel},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {3},
  pages =        {410--416},
  month =        jun,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:53 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Moto-oka:1983:OFG,
  author =       {Tohru Moto-oka},
  title =        {Overview to the {Fifth Generation Computer System} project},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {3},
  pages =        {417--422},
  month =        jun,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:53 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Murakami:1983:RDB,
  author =       {Kunio Murakami and Takeo Kakuta and Nobuyoshi Miyazaki and
                 Shigeki Shibayama and Haruo Yokota},
  title =        {A relational data base machine: {First} step to knowledge
                 base machine},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {3},
  pages =        {423--425},
  month =        jun,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:53 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Arvind:1983:CMN,
  author =       {Arvind and Robert A. Iannucci},
  title =        {A critique of multiprocessing {von Neumann} style},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {3},
  pages =        {426--436},
  month =        jun,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:53 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Hill:1983:ACM,
  author =       {Dwight D. Hill},
  title =        {An analysis of {C} machine support
                 for other block-structured languages},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {4},
  pages =        {6--16},
  month =        sep,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:10 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Marovac:1983:IID,
  author =       {Nenad Marovac},
  title =        {On interprocess interaction in distributed architectures},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {4},
  pages =        {17--22},
  month =        sep,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:10 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Schalkoff:1983:TED,
  author =       {Robert J. Schalkoff},
  title =        {Towards an efficient, dedicated architecture for a {Digital
                 Geometric Image Transformer (DGIT)}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {4},
  pages =        {23--29},
  month =        sep,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:10 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Plotkin:1983:TSA,
  author =       {Arieh Plotkin and Daniel Tabak},
  title =        {A {Tree Structured Architecture} for semantic gap reduction},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {4},
  pages =        {30--44},
  month =        sep,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:10 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Wilkes:1983:KJI,
  author =       {Maurice V. Wilkes},
  title =        {Keeping jump instructions out of the pipeline of
                 a {RISC}-like computer},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {11},
  number =       {5},
  pages =        {5--7},
  month =        dec,
  year =         {1983},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:17 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Jones:1983:PM,
  author =       "Jeremy Jones",
  title =        "Puzzling with microcode",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "5",
  pages =        "8--12",
  month =        dec,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:17 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Amsbury:1983:CSA,
  author =       "Wayne Amsbury",
  title =        "A code-splitting algorithm",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "5",
  pages =        "13--21",
  month =        dec,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:17 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Dongarra:1983:PVC,
  author =       "Jack J. Dongarra",
  title =        "Performance of various computers using standard linear
                 equations software in a {Fortran} environment",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "5",
  pages =        "22--27",
  month =        dec,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:17 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bhujade:1983:DAC,
  author =       "M. R. Bhujade",
  title =        "On the design of {Always Compatible Instruction Set
                 Architecture (ACISA)}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "11",
  number =       "5",
  pages =        "28--30",
  month =        dec,
  year =         "1983",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:17 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Heath:1984:RER,
  author =       "J. L. Heath",
  title =        "Re-evaluation of the {RISC I}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "1",
  pages =        "3--10",
  month =        mar,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:31 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Patterson:1984:RW,
  author =       "David A. Patterson",
  title =        "{RISC} watch",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "1",
  pages =        "11--19",
  month =        mar,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:31 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Beeler:1984:BBB,
  author =       "Michael Beeler",
  title =        "Beyond the {Baskett} benchmark",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "1",
  pages =        "20--31",
  month =        mar,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:31 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Feustel:1984:PEP,
  author =       "Edward A. Feustel",
  title =        "Process exchange on the {PR1ME} family of computers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "1",
  pages =        "32--43",
  month =        mar,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:31 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Fenwick:1984:AOA,
  author =       "P. M. Fenwick",
  title =        "Addressing operations for automatic data structure
                 accessing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "1",
  pages =        "44--57",
  month =        mar,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:31 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Yuen:1984:SAI,
  author =       "C. K. Yuen",
  title =        "Some applications of the implicit register reference",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "1",
  pages =        "58--63",
  month =        mar,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:31 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kavi:1984:AQ,
  author =       "Krishna M. Kavi and K. Krishnamohan",
  title =        "Architecture quality",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "1",
  pages =        "64--72",
  month =        mar,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:31 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Agrawal:1984:BHH,
  author =       "Dharma P. Agrawal and Winser E. Alexander",
  title =        "{B-HIVE}: a heterogeneous, interconnected, versatile
                 and expandable multicomputer system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "2",
  pages =        "7--13",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Burkowski:1984:VAM,
  author =       "F. J. Burkowski",
  title =        "A vector and array multiprocessor extension of the
                 {Sylvan} architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "4--11",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kapauan:1984:PPC,
  author =       "Alejandro Kapauan and J. Timothy Field and Dennis B.
                 Gannon and Lawrence Snyder",
  title =        "The {Pringle} parallel computer",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "12--20",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Yasrebi:1984:SAS,
  author =       "Mehrad Yasrebi and G. J. Lipovski",
  title =        "A state-of-the-art {SIMD} two-dimensional {FFT} array
                 processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "21--27",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ma:1984:ARS,
  author =       "Y. W. Ma and R. Krishnamurti",
  title =        "The architecture of {Replica}: a special-purpose
                 computer system for active multi-sensory perception of
                 $3$-dimensional objects",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "30--37",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Goldwasser:1984:GOD,
  author =       "Samuel M. Goldwasser",
  title =        "A generalized object display processor architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "38--47",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kawakami:1984:SPL,
  author =       "Katsura Kawakami and Shigeo Shimazaki",
  title =        "A special purpose {LSI} processor using the {DDA}
                 algorithm for image transformation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "48--54",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wah:1984:SMM,
  author =       "Benjamin W. Wah and Guo-Jie Li and Chee-Fen Yu",
  title =        "The status of {MANIP} --- a multicomputer architecture
                 for solving combinatorial extremum-search problems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "56--63",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gonzalez-Rubio:1984:SFP,
  author =       "R. Gonzalez-Rubio and J. Rohmer and D. Terral",
  title =        "The {SCHUSS} filter: a processor for non-numerical
                 data processing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "64--73",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ebeling:1984:DIV,
  author =       "Carl Ebeling and Andrew Palay",
  title =        "The design and implementation of a {VLSI} chess move
                 generator",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "74--80",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lee:1984:PAC,
  author =       "Manjai Lee and Chuan-Lin Wu",
  title =        "Performance analysis of circuit switching baseline
                 interconnection networks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "82--90",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kruskal:1984:IBS,
  author =       "Clyde P. Kruskal and Marc Snir",
  title =        "The importance of being square",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "91--98",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chin:1984:CPM,
  author =       "Chi-Yuan Chin and Kai Hwang",
  title =        "Connection principles for multipath packet switching
                 networks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "99--108",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Weiss:1984:IIL,
  author =       "Shlomo Weiss and James E. Smith",
  title =        "Instruction issue logic for pipelined supercomputers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "110--118",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wedig:1984:RBI,
  author =       "Robert G. Wedig and Marc A. Rose",
  title =        "The reduction of branch instruction execution overhead
                 using structured control flow",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "119--125",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Banerjee:1984:FEL,
  author =       "Utpal Banerjee and Daniel D. Gajski",
  title =        "Fast execution of loops with if statements",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "126--132",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gajski:1984:PPR,
  author =       "Daniel Gajski and Won Kim and Shinya Fushimi",
  title =        "A parallel pipelined relational query processor: an
                 architectural overview",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "134--141",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Somani:1984:EVD,
  author =       "Arun K. Somani and Vinod K. Agarwal",
  title =        "An efficient {VLSI} dictionary machine",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "142--150",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Fisher:1984:DMS,
  author =       "Allan L. Fisher",
  title =        "Dictionary machines with a small number of
                 processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "151--156",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hill:1984:EEC,
  author =       "Mark D. Hill and Alan Jay Smith",
  title =        "Experimental evaluation of on-chip microprocessor
                 cache memories",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "158--166",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Goodman:1984:USC,
  author =       "James R. Goodman and Men-Chow Chiang",
  title =        "The use of static column {RAM} as a memory hierarchy",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "167--173",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Haikala:1984:CHRa,
  author =       "I. J. Haikala",
  title =        "Cache hit ratios with geometric task switch
                 intervals",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "175--175",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ishikawa:1984:DOO,
  author =       "Yutaka Ishikawa and Mario Tokoro",
  title =        "The design of an object oriented architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "178--187",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ungar:1984:ASS,
  author =       "David Ungar and Ricki Blau and Peter Foley and Dain
                 Samples and David Patterson",
  title =        "Architecture of {SOAR}: {Smalltalk} on a {RISC}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "188--197",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bose:1984:DIS,
  author =       "Pradip Bose and Edward S. Davidson",
  title =        "Design of instruction set architectures for support of
                 high-level languages",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "198--206",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Quinton:1984:ASS,
  author =       "Patrice Quinton",
  title =        "Automatic synthesis of systolic arrays from uniform
                 recurrent equations",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "208--214",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Zhang:1984:MDS,
  author =       "Chang Nian Zhang and David Y. Y. Yun",
  title =        "Multi-dimensional systolic networks for {Discrete
                 Fourier Transform}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "215--222",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Fortes:1984:DBL,
  author =       "J. A. B. Fortes and D. I. Moldovan",
  title =        "Data broadcasting in linearly scheduled array
                 processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "224--231",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ramakrishnan:1984:MMM,
  author =       "I. V. Ramakrishnan and P. J. Varman",
  title =        "Modular matrix multiplication on a linear array",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "232--238",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Rao:1984:JEE,
  author =       "T. R. N. Rao",
  title =        "Joint encryption and error correction schemes",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "240--241",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bose:1984:UEC,
  author =       "Bella Bose",
  title =        "Unidirectional error correction\slash detection for
                 {VLSI} memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "242--244",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chen:1984:ECC,
  author =       "C. L. Chen",
  title =        "Error-correcting codes for semiconductor memories",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "245--247",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ghaffar:1984:SEC,
  author =       "Khaled Abdel Ghaffar and Robert J. McEliece",
  title =        "Soft error correction for increased densities in
                 {VLSI} memories",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "248--250",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{King:1984:CSA,
  author =       "Richard M. King and Robert A. Wagner",
  title =        "Combining speed with alpha-particle induced memory
                 error tolerance in a large {Boolean} vector machine",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "251--253",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bhuyan:1984:PLC,
  author =       "Laxmi N. Bhuyan",
  title =        "On the performance of loosely coupled
                 multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "256--262",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@article{Mehrotra:1984:STD,
  author          = {Ravi Mehrotra and Sarosh N. Talukdar},
  title           = {Scheduling of tasks for distributed processors},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {12},
  number          = {3},
  pages           = {263--270},
  month           = jun,
  year            = {1984},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Kavi:1984:MRD,
  author          = {Krishna M. Kavi and Edward W. Banios and Bruce D. Shriver},
  title           = {Message repository definitional facility: an architectural
                     model for interprocess communication},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {12},
  number          = {3},
  pages           = {271--278},
  month           = jun,
  year            = {1984},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Banerjee:1984:FSA,
  author          = {Prithviraj Banerjee and Jacob A. Abraham},
  title           = {Fault-secure algorithms for multiple-processor systems},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {12},
  number          = {3},
  pages           = {279--287},
  month           = jun,
  year            = {1984},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Bic:1984:ELP,
  author          = {Lubomir Bic},
  title           = {Execution of logic programs on a dataflow architecture},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {12},
  number          = {3},
  pages           = {290--296},
  month           = jun,
  year            = {1984},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Rudd:1984:HPF,
  author          = {W. G. Rudd and Duncan A. Buell and Donald M. Chiarulli},
  title           = {A high performance factoring machine},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {12},
  number          = {3},
  pages           = {297--300},
  month           = jun,
  year            = {1984},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Emer:1984:CPP,
  author          = {Joel S. Emer and Douglas W. Clark},
  title           = {A characterization of processor performance in the
                     {VAX-11\slash 780}},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {12},
  number          = {3},
  pages           = {301--310},
  month           = jun,
  year            = {1984},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Moeller:1984:PPP,
  author          = {W. D. Moeller and G. Sandweg},
  title           = {The peripheral processor {PP4}, a highly regular {VLSI}
                     processor},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {12},
  number          = {3},
  pages           = {312--318},
  month           = jun,
  year            = {1984},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Philipson:1984:VBD,
  author          = {Lars Philipson},
  title           = {{VLSI} based design principles for {MIMD} multiprocessor
                     computers with distributed memory management},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {12},
  number          = {3},
  pages           = {319--327},
  month           = jun,
  year            = {1984},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Samatham:1984:MNS,
  author          = {M. R. Samatham and D. K. Pradhan},
  title           = {A multiprocessor network suitable for single-chip {VLSI}
                     implementation},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {12},
  number          = {3},
  pages           = {328--339},
  month           = jun,
  year            = {1984},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Rudolph:1984:DDC,
  author          = {Larry Rudolph and Zary Segall},
  title           = {Dynamic decentralized cache schemes for {MIMD} parallel
                     processors},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {12},
  number          = {3},
  pages           = {340--347},
  month           = jun,
  year            = {1984},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Papamarcos:1984:LOC,
  author          = {Mark S. Papamarcos and Janak H. Patel},
  title           = {A low-overhead coherence solution for multiprocessors with
                     private cache memories},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {12},
  number          = {3},
  pages           = {348--354},
  month           = jun,
  year            = {1984},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Archibald:1984:ESC,
  author =       "James Archibald and Jean-Loup Baer",
  title =        "An economical solution to the cache coherence
                 problem",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "12",
  number =       "3",
  pages =        "355--362",
  month =        jun,
  year =         "1984",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@article{Haikala:1984:CHRb,
  author          = {Ilkka J. Haikala},
  title           = {Cache hit ratios with geometric task switch intervals},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {12},
  number          = {3},
  pages           = {364--371},
  month           = jun,
  year            = {1984},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Chesley:1984:WM,
  author          = {Gilman D. Chesley},
  title           = {A wafer microcomputer},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {12},
  number          = {4},
  pages           = {4--6},
  month           = sep,
  year            = {1984},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Fri May 12 09:41:10 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Siegel:1984:PRP,
  author          = {Howard Jay Siegel and Thomas Schwederski and
                     Nathaniel J. {Davis IV} and James T. Kuehn},
  title           = {{PASM}: a reconfigurable parallel system for image
                     processing},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {12},
  number          = {4},
  pages           = {7--19},
  month           = sep,
  year            = {1984},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Fri May 12 09:41:10 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Aslam:1984:MDC,
  author          = {Javaid Aslam},
  title           = {Methodology for designing a computer architecture},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {12},
  number          = {5},
  pages           = {4--11},
  month           = dec,
  year            = {1984},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Fri May 12 09:41:18 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Graham:1984:PAS,
  author          = {Peter C. J. Graham},
  title           = {Providing architectural support for expert systems},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {12},
  number          = {5},
  pages           = {12--18},
  month           = dec,
  year            = {1984},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Fri May 12 09:41:18 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Dongarra:1985:PVC,
  author          = {Jack J. Dongarra},
  title           = {Performance of various computers using standard linear
                     equations software in a {Fortran} environment},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {13},
  number          = {1},
  pages           = {3--11},
  month           = mar,
  year            = {1985},
  coden           = {CANED2},
  doi             = {https://doi.org/10.1145/1296930.1296931},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Tue Jun 17 12:06:22 MDT 2008},
  bibsource       = {ftp://ftp.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                     http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  abstract        = {This note compares the performance of different
                     computer systems while solving dense systems of linear
                     equations using the LINPACK software in a Fortran
                     environment. About 100 computers, ranging from a CRAY
                     X-MP to the 68000 based systems such as the Apollo and
                     SUN Workstations to IBM PC's, are compared.},
  classcodes      = {C4140 (Linear algebra); C5470 (Performance evaluation
                     and testing); C7310 (Mathematics computing)},
  corpsource      = {Div. of Math. and Comput. Sci., Argonne Nat. Lab., IL,
                     USA},
  keywords        = {68000 based; Apollo workstations; Cray X-MP; dense
                     systems; evaluation; FORTRAN environment; IBM PCs;
                     linear algebra; linear equations; LINPACK; performance;
                     performance comparison; performance evaluation;
                     software; Sun Workstations; systems},
  treatment       = {X Experimental},
}

@article{Hor:1985:DPP,
  author          = {T. M. Hor and C. K. Yuen},
  title           = {The design and programming of a powerful short wordlength
                     processor using context-dependent machine instructions},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {13},
  number          = {1},
  pages           = {12--26},
  month           = mar,
  year            = {1985},
  coden           = {CANED2},
  doi             = {https://doi.org/10.1145/1296930.1296932},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Tue Jun 17 12:06:22 MDT 2008},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Context-dependent machine instructions were used to
                     extend the capability of instruction set of a short
                     wordlength processor. By freeing instruction bits for
                     other purposes, a more powerful machine instruction set
                     can be designed. Programming examples were given to
                     illustrate the benefit obtained from the design. Less
                     CPU time and memory space were required as compared
                     with popular 8-bit CPUs.},
}

@article{Miya:1985:MDP,
  author          = {E. N. Miya},
  title           = {Multiprocessor\slash distributed processing bibliography
                     (in machine-readable form)},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {13},
  number          = {1},
  pages           = {27--29},
  month           = mar,
  year            = {1985},
  coden           = {CANED2},
  doi             = {https://doi.org/10.1145/1296930.1296933},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Tue Jun 17 12:06:22 MDT 2008},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  abstract        = {There is a lot of renewed interest in parallel
                     processing. People parallel process, too. Human
                     parallel processing tends to be cooperative rather than
                     competitive. To this end, research literature uses
                     bibliographies like road-maps to the field.},
  keywords        = {annotated bibliography; cellular automata; computer
                     system architecture; fault-tolerant computers;
                     multicomputers; multiprocessor software; networks;
                     operating systems; parallel algorithms; parallel
                     processing; programming languages; supercomputers;
                     vector processing},
}

@article{Hu:1985:DAE,
  author          = {Weiming Hu},
  title           = {Dataflow architecture for {EEG} patient monitor},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {13},
  number          = {2},
  pages           = {3--10},
  month           = jun,
  year            = {1985},
  coden           = {CANED2},
  doi             = {https://doi.org/10.1145/1296935.1296936},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Tue Jun 17 12:06:26 MDT 2008},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Much work is currently directed towards dataflow
                     architectures. Most of the proposed architectures
                     attempt to exploit fine grained parallelism. This paper
                     describes an application specific dataflow architecture
                     which exploits coarse grained parallelism. The
                     application is that of a real-time patient monitor used
                     to display patient data.},
}

@article{Tagg:1985:SEA,
  author          = {A. G. Tagg},
  title           = {Speculations on the evolution of an architecture},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {13},
  number          = {2},
  pages           = {11--18},
  month           = jun,
  year            = {1985},
  coden           = {CANED2},
  doi             = {https://doi.org/10.1145/1296935.1296937},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Tue Jun 17 12:06:26 MDT 2008},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  abstract        = {PRIME computers was formed in the early 1970s by a
                     splinter group of hardware and software engineers from
                     Honeywell. With them, they brought their ideas on
                     minicomputers, based on their experience of Honeywell
                     minis, and their experience of the MULTICS operating
                     system.},
}

@article{Randell:1985:HST,
  author          = {Brian Randell},
  title           = {Hardware\slash software tradeoffs: a general design
                     principle?},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {13},
  number          = {2},
  pages           = {19--21},
  month           = jun,
  year            = {1985},
  coden           = {CANED2},
  doi             = {https://doi.org/10.1145/1296935.1296938},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Tue Jun 17 12:06:26 MDT 2008},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Hardware and software are logically equivalent. Any
                     operation performed by software can also be built
                     directly into the hardware and any instruction executed
                     by the hardware can also be simulated in software. The
                     decision to put certain features in hardware and others
                     in software is based on such factors as cost, speed,
                     reliability and frequency of change. There are no hard
                     and fast rules to the effect that X must go into the
                     hardware and Y must be programmed explicitly. Designers
                     with different goals may, and often do, make different
                     decisions\ldots{} the boundary between hardware and
                     software is arbitrary and constantly changing. Today's
                     software is tomorrow's hardware, and vice versa. [1]},
}

@article{Kumar:1985:APM,
  author          = {V. K. Prasanna Kumar and C. S. Raghavendra},
  title           = {Array processor with multiple broadcasting},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {13},
  number          = {3},
  pages           = {2--10},
  month           = jun,
  year            = {1985},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Wolf:1985:MMI,
  author          = {G. Wolf and J. R. Jump},
  title           = {Matrix multiplication in an interleaved array processing
                     architecture},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {13},
  number          = {3},
  pages           = {11--17},
  month           = jun,
  year            = {1985},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Goodman:1985:PVD,
  author          = {J. R. Goodman and Jian-tu Hsieh and Koujuch Liou and
                     Andrew R. Pleszkun and P. B. Schechter and
                     Honesty C. Young},
  title           = {{PIPE}: a {VLSI} decoupled architecture},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {13},
  number          = {3},
  pages           = {20--27},
  month           = jun,
  year            = {1985},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Hsu:1985:TST,
  author          = {Peter Y. T. Hsu and Joseph T. Rahmeh and
                     Edward S. Davidson and Jacob A. Abraham},
  title           = {{TIDBITS}: speedup via time-delay bit-slicing in {ALU}
                     design for {VLSI} technology},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {13},
  number          = {3},
  pages           = {29--35},
  month           = jun,
  year            = {1985},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Smith:1985:IPI,
  author          = {James E. Smith and Andrew R. Pleszkun},
  title           = {Implementation of precise interrupts in pipelined
                     processors},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {13},
  number          = {3},
  pages           = {36--44},
  month           = jun,
  year            = {1985},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Schwetman:1985:CPP,
  author          = {Herb Schwetman and Daniel Gajski and Dennis Gannon and
                     Daniel Hills and Jacob Schwartz and James Browne},
  title           = {Classification of parallel processor architectures
                     (invited tutorial session)},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {13},
  number          = {3},
  pages           = {45--45},
  month           = jun,
  year            = {1985},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Hasegawa:1985:HST,
  author          = {Makoto Hasegawa and Yoshiharu Shigei},
  title           = {High-speed top-of-stack scheme for {VLSI} processor: a
                     management algorithm and its analysis},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {13},
  number          = {3},
  pages           = {48--54},
  month           = jun,
  year            = {1985},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Hitchcock:1985:AMR,
  author          = {Charles Y. {Hitchcock III} and H. M. Brinkley Sprunt},
  title           = {Analyzing multiple register sets},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {13},
  number          = {3},
  pages           = {55--63},
  month           = jun,
  year            = {1985},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Smith:1985:CEI,
  author          = {Alan Jay Smith},
  title           = {Cache evaluation and the impact of workload choice},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {13},
  number          = {3},
  pages           = {64--73},
  month           = jun,
  year            = {1985},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Moon:1985:AS,
  author          = {David A. Moon},
  title           = {Architecture of the {Symbolics 3600}},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {13},
  number          = {3},
  pages           = {76--83},
  month           = jun,
  year            = {1985},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Ram:1985:PGC,
  author          = {Ashwin Ram and Janak H. Patel},
  title           = {Parallel garbage collection without synchronization
                     overhead},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {13},
  number          = {3},
  pages           = {84--90},
  month           = jun,
  year            = {1985},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Sohi:1985:ELE,
  author          = {Gurindar S. Sohi and Edward S. Davidson and
                     Janak H. Patel},
  title           = {An efficient {LISP}-execution architecture with a new
                     representation for list structures},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {13},
  number          = {3},
  pages           = {91--98},
  month           = jun,
  year            = {1985},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Amano:1985:SIN,
  author =       "Hideharu Amano and Taisuke Boku and Tomohiro Kudoh and
                 Hideo Aiso",
  title =        "{(SM)$^2$-II}: a new version of the sparse matrix
                 solving machine",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "13",
  number =       "3",
  pages =        "100--107",
  month =        jun,
  year =         "1985",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@article{Beetem:1985:GS,
  author          = {John Beetem and Monty Denneau and Don Weingarten},
  title           = {The {GF11} supercomputer},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {13},
  number          = {3},
  pages           = {108--115},
  month           = jun,
  year            = {1985},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Smith:1985:MUD,
  author          = {Bradley Warren Smith and Howard Jay Siegel},
  title           = {Models for use in the design of macro-pipelined parallel
                     processors},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {13},
  number          = {3},
  pages           = {116--123},
  month           = jun,
  year            = {1985},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Edler:1985:IRM,
  author          = {Jan Edler and Allan Gottlieb and Clyde P. Kruskal and
                     Kevin P. McAuliffe and Larry Rudolph and Marc Snir and
                     Patricia J. Teller and James Wilson},
  title           = {Issues related to {MIMD} shared-memory computers: the
                     {NYU Ultracomputer} approach},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  volume          = {13},
  number          = {3},
  pages           = {126--135},
  month           = jun,
  year            = {1985},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Ibbett:1985:MPV,
  author          = {R. N. Ibbett and P. C. Capon and N. P. Topham},
  title           = {{MU6V}: a parallel vector processing system},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {136--144},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Lundstrom:1985:DCH,
  author          = {Stephen F. Lundstrom},
  title           = {A decentralized control, highly concurrent
                     multiprocessor},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {145--151},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Dally:1985:OOA,
  author          = {William J. Dally and James T. Kajiya},
  title           = {An object oriented architecture},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {154--161},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Gehringer:1985:TAH,
  author          = {Edward F. Gehringer and J. Leslie Keedy},
  title           = {Tagged architecture: how compelling are its
                     advantages?},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {162--170},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Nanba:1985:VAV,
  author          = {S. Nanba and N. Ohno and H. Kubo and H. Morisue and T.
                     Ohshima and H. Yamagishi},
  title           = {{VM\slash 4}: {ACOS-4} virtual machine architecture},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {171--178},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Dobry:1985:PSP,
  author          = {T. P. Dobry and A. M. Despain and Y. N. Patt},
  title           = {Performance studies of a {Prolog} machine
                     architecture},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {180--190},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

%%% NOTE(review): the fifth author's surname was previously spelled
%%% "Umemutra", which looks like a typographical error for "Umemura";
%%% confirm against the printed paper.
@Article{Nakazaki:1985:DHS,
  author =       "Ryosei Nakazaki and Akihiko Konagaya and Shin'ichi
                 Habata and Hideo Shimazu and Mamoru Umemura and
                 Masahiro Yamamoto and Minoru Yokota and Takashi
                 Chikayama",
  title =        "Design of a high-speed {Prolog} machine {(HPM)}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "13",
  number =       "3",
  pages =        "191--197",
  month =        jun,
  year =         "1985",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Woo:1985:HUU,
  author          = {Nam Sung Woo},
  title           = {A hardware unification unit: design and analysis},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {198--205},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Matelan:1985:FM,
  author          = {Nicholas Matelan},
  title           = {The {FLEX\slash 32} multicomputer},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {209--213},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Rattner:1985:CMT,
  author          = {J. Rattner},
  title           = {Commercial multiprocessors (title only)},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {214--214},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Naedel:1985:CCA,
  author          = {Dick Naedel},
  title           = {Closely coupled asynchronous hierarchical and parallel
                     processing in an open architecture},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {215--220},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Savage:1985:PPL,
  author          = {Jim Savage},
  title           = {Parallel processing as a language design problem},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {221--224},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Rodgers:1985:IMS,
  author          = {David P. Rodgers},
  title           = {Improvements in multiprocessor system design},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {225--231},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Mark:1985:SCF,
  author          = {Peter B. Mark},
  title           = {The {Sequoia} computer: a fault-tolerant
                     tightly-coupled multiprocessor architecture},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {232--232},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Nestle:1985:SNS,
  author          = {Elliot Nestle and Armond Inselberg},
  title           = {The {SYNAPSE N+1 System}: architectural
                     characteristics and performance data of a
                     tightly-coupled multiprocessor system},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {233--239},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Horst:1985:AHV,
  author          = {Robert W. Horst and Timothy C. K. Chou},
  title           = {An architecture for high volume transaction
                     processing},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {240--245},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Stone:1985:FGC,
  author          = {Harold Stone and Eric Manning and Harriet Rigas and
                     Philip Treleaven},
  title           = {The fifth generation computer systems projects
                     (invited session)},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {247--247},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Kamiya:1985:HPA,
  author          = {Shigeo Kamiya and Susumu Matsuda and Kazuhide Iwata
                     and Shigeki Shibayama and Hiroshi Sakai and Kunio
                     Murakami},
  title           = {A hardware pipeline algorithm for relational database
                     operation},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {250--257},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Lee:1985:DMR,
  author          = {Dik Lun Lee},
  title           = {A distributed multiple-response resolver for
                     value-order retrieval},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {258--265},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Feo:1985:DDR,
  author          = {John Feo and Roy Jenevein and J. C. Browne},
  title           = {Dynamic, distributed resource configuration on
                     {SW}-banyans},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {268--275},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Katz:1985:ICC,
  author          = {R. H. Katz and S. J. Eggers and D. A. Wood and C. L.
                     Perkins and R. G. Sheldon},
  title           = {Implementing a cache consistency protocol},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {276--283},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Li:1985:TRS,
  author          = {Zhiyuan Li and Walid Abu-Sufah},
  title           = {A technique for reducing synchronization overhead in
                     large scale multiprocessors},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {284--291},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Whitby-Strevens:1985:T,
  author          = {Colin Whitby-Strevens},
  title           = {The transputer},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {292--300},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Hurson:1985:SMU,
  author          = {A. R. Hurson and B. Shirazi},
  title           = {A systolic multiplier unit and its {VLSI} design},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {302--309},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Melhem:1985:LSS,
  author          = {Rami Melhem},
  title           = {A language for the simulation of systolic
                     architectures},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {310--314},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Chuang:1985:VSA,
  author          = {Henry Y. H. Chuang and Guo He},
  title           = {A versatile systolic array for matrix computations},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {315--322},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Vedder:1985:HDF,
  author          = {Rex Vedder and Dennis Finn},
  title           = {The {Hughes Data Flow Multiprocessor}: architecture
                     for efficient signal and data processing},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {324--332},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Traub:1985:APG,
  author          = {Kenneth R. Traub},
  title           = {An abstract parallel graph reduction machine},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {333--341},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Preiss:1985:DFQ,
  author          = {Bruno R. Preiss and V. C. Hamacher},
  title           = {Data flow on a queue machine},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {342--351},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Gaudiot:1985:MHS,
  author          = {J. L. Gaudiot},
  title           = {Methods for handling structures in data-flow systems},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {352--358},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Samatham:1985:BMN,
  author          = {M. R. Samatham and D. K. Pradhan},
  title           = {The {de Bruijn} multiprocessor network: a versatile
                     sorting network},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {360--367},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Tzeng:1985:FTS,
  author          = {Nian-Feng Tzeng and Pen-Chung Yew and Chun-Qi Zhu},
  title           = {A fault-tolerant scheme for multistage interconnection
                     networks},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {368--375},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Kumar:1985:DAF,
  author          = {V. P. Kumar and S. M. Reddy},
  title           = {Design and analysis of fault-tolerant multistage
                     interconnection networks with low link complexity},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {376--386},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Davis:1985:PAP,
  author          = {Nathaniel J. {Davis IV} and Howard Jay Siegel},
  title           = {The performance analysis of partitioned circuit
                     switched multistage interconnection networks},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {387--394},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Vrsalovic:1985:IPD,
  author          = {Dalibor Vrsalovic and Edward F. Gehringer and Zary Z.
                     Segall and Daniel P. Siewiorek},
  title           = {The influence of parallel decomposition strategies on
                     the performance of multiprocessor systems},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {396--405},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Abu-Sufah:1985:PPT,
  author          = {Walid Abu-Sufah and Alex Y. Kwok},
  title           = {Performance prediction tools for {Cedar}: a
                     multiprocessor supercomputer},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {406--413},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

%%% NOTE(review): the authors appear to carry Spanish two-part family
%%% names (e.g., Valero Cort{\'e}s), so each name is given in
%%% "Last, First" form to keep both parts in the surname for sorting
%%% and abbreviation; confirm against the printed paper.  The citation
%%% key is left unchanged so that existing citations still resolve.
@Article{Grino:1985:ASM,
  author =       "Llaber{\'\i}a Gri{\~n}{\'o}, Jos{\'e} M. and Valero
                 Cort{\'e}s, Mateo and Herrada Lillo, Enrique and
                 Labarta Mancho, Jes{\'u}s",
  title =        "Analysis and simulation of multiplexed single-bus
                 networks with and without buffering",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "13",
  number =       "3",
  pages =        "414--421",
  month =        jun,
  year =         "1985",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:54 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sanguinetti:1985:PMB,
  author          = {J. Sanguinetti and B. Kumar},
  title           = {Performance of a message-based multiprocessor},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {3},
  pages           = {424--425},
  month           = jun,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:54 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Hake:1985:PDP,
  author          = {J.-Fr. Hake},
  title           = {{PDOC} --- a database on parallel processing
                     literature},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {4},
  pages           = {2--7},
  month           = sep,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:10 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Rockey:1985:DAS,
  author          = {Mark Rockey},
  title           = {The dataflow architecture: a suitable base for the
                     implementation of expert systems},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {4},
  pages           = {8--14},
  month           = sep,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:10 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Cragon:1985:ADS,
  author          = {Harvey G. Cragon},
  title           = {An architecture design system},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {4},
  pages           = {15--21},
  month           = sep,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:10 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Huguet:1985:RRF,
  author          = {Miquel Huguet and Tom{\'a}s Lang},
  title           = {A reduced register file for {RISC} architectures},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {4},
  pages           = {22--31},
  month           = sep,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:10 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Alexander:1985:TBP,
  author          = {Cedell A. Alexander and William M. Keshlear and Faye
                     Briggs},
  title           = {Translation buffer performance in a {UNIX}
                     environment},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {5},
  pages           = {2--14},
  month           = dec,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:18 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Lee:1985:HSC,
  author          = {Rosanna Lee},
  title           = {On ``hot spot'' contention},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {13},
  number          = {5},
  pages           = {15--20},
  month           = dec,
  year            = {1985},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:18 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Woo:1986:CHU,
  author          = {Nam Sung Woo and Richard O'Keefe},
  title           = {A comment on {``A hardware unification unit: design
                     and analysis''}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {14},
  number          = {1},
  pages           = {2--3},
  month           = jan,
  year            = {1986},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:29 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Ruighaver:1986:DAD,
  author =       "A. B. Ruighaver",
  title =        "Design aspects of the {Delft Parallel Processor DPP84}
                 and its programming system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "1",
  pages =        "4--8",
  month =        jan,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:29 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hammerstrom:1986:CAP,
  author =       "Dan Hammerstrom and David Maier and Shreekant
                 Thakkar",
  title =        "The {Cognitive Architecture Project}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "1",
  pages =        "9--21",
  month =        jan,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:29 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Smith:1986:BRC,
  author =       "Alan Jay Smith",
  title =        "Bibliography and reading on {CPU} cache memories and
                 related topics",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "1",
  pages =        "22--42",
  month =        jan,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:29 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Yokota:1986:MAR,
  author =       "H. Yokota and H. Itoh",
  title =        "A model and an architecture for a relational knowledge
                 base",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "2--9",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Amamiya:1986:IEL,
  author =       "M. Amamiya and M. Takesue and R. Hasegawa and H.
                 Mikami",
  title =        "Implementation and evaluation of a
                 list-processing-oriented data flow machine",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "10--19",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Takahashi:1986:NSS,
  author =       "K. Takahashi and H. Yamada and H. Nagai and K.
                 Matsumi",
  title =        "A new string search hardware architecture for {VLSI}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "20--27",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gupta:1986:PAA,
  author =       "A. Gupta and C. Forgy and A. Newell and R. Wedig",
  title =        "Parallel algorithms and architectures for rule-based
                 systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "28--37",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Halstead:1986:CDM,
  author =       "R. R. {Halstead, Jr.} and T. L. Anderson and R. B.
                 Osborne and T. L. Sterling",
  title =        "{Concert}: design of a multiprocessor development
                 system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "40--48",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kung:1986:MRB,
  author =       "H. T. Kung",
  title =        "Memory requirements for balanced computer
                 architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "49--54",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hong:1986:GAS,
  author =       "Y. C. Hong and T. H. Payne and L. B. O. Ferguson",
  title =        "Graph allocation in static dataflow systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "55--64",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Agrawal:1986:SIR,
  author =       "P. Agrawal and R. Agrawal",
  title =        "Software implementation of a recursive fault tolerance
                 algorithm on a network of computers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "65--72",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Nojiri:1986:MPO,
  author =       "T. Nojiri and S. Kawasaki and K. Sakoda",
  title =        "Microprogrammable processor for object-oriented
                 architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "74--81",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thakkar:1986:IFU,
  author =       "S. S. Thakkar and W. E. Hostmann",
  title =        "An instruction fetch unit for a graph reduction
                 machine",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "82--91",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gehringer:1986:FOO,
  author =       "E. F. Gehringer and R. P. Colwell",
  title =        "Fast object-oriented procedure calls: lessons from the
                 {Intel 432}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "92--101",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Dias:1986:CMS,
  author =       "D. M. Dias and B. R. Iyer and P. S. Yu",
  title =        "On coupling many small systems for transaction
                 processing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "104--110",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Malkawi:1986:PMP,
  author =       "M. I. Malkawi and J. H. Patel",
  title =        "Performance measurement of paging behavior in
                 multiprogramming systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "111--118",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Agarwal:1986:ANT,
  author =       "A. Agarwal and R. L. Sites and M. Horowitz",
  title =        "{ATUM}: a new technique for capturing address traces
                 using microcode",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "119--127",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wise:1986:EES,
  author =       "M. J. Wise",
  title =        "Experimenting with {EPILOG}: some results and
                 preliminary conclusions",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "130--139",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Shobatake:1986:UPB,
  author =       "Y. Shobatake and H. Aiso",
  title =        "A unification processor based on a uniformly
                 structured cellular hardware",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "140--148",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ito:1986:APE,
  author =       "N. Ito and M. Sato and E. Kuno and K. Rokusawa",
  title =        "The architecture and preliminary evaluation results of
                 the experimental parallel inference machine {PIM-D}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "149--156",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Seznec:1986:ERC,
  author =       "A. Seznec",
  title =        "An efficient routing control for the {SIGMA} network
                 {$ \Sigma (4) $}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "158--168",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Nicoud:1986:RHP,
  author =       "J. D. Nicoud and K. Skala",
  title =        "{REYSM}, a high performance, low power multi-processor
                 bus",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "169--174",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lee:1986:ESG,
  author =       "K. Y. Lee and W. Hegazy",
  title =        "The extra stage gamma network",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "175--182",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Yuhara:1986:EFA,
  author =       "M. Yuhara and A. Hattori and M. Niwa and M. Kishimoto
                 and H. Hayashi",
  title =        "Evaluation of the {FACOM ALPHA Lisp} machine",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "184--190",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Pleszkun:1986:AEL,
  author =       "A. R. Pleszkun and M. J. Thazhuthaveetil",
  title =        "An architecture for efficient {Lisp} list access",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "191--198",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Nakata:1986:FLS,
  author =       "T. Nakata and N. Koike",
  title =        "A functional level simulation engine of {MAN-YO}: a
                 special purpose parallel machine for logic design
                 automation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "202--208",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Frank:1986:EPS,
  author =       "E. H. Frank",
  title =        "Exploiting parallelism in a switch-level simulation
                 machine",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "209--215",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Anantharaman:1986:HAS,
  author =       "T. S. Anantharaman and R. Bisiani",
  title =        "A hardware accelerator for speech recognition
                 algorithms",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "216--223",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Shimada:1986:EPD,
  author =       "T. Shimada and K. Hiraki and K. Nishida and S.
                 Sekiguchi",
  title =        "Evaluation of a prototype data flow processor of the
                 {SIGMA-1} for scientific computations",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "226--234",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sargeant:1986:SDS,
  author =       "J. Sargeant and C. C. Kirkham",
  title =        "Stored data structures on the {Manchester} dataflow
                 machine",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "235--242",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hawakami:1986:SDS,
  author =       "K. Kawakami and J. R. Gurd",
  title =        "A scalable dataflow structure store",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "243--250",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hasegawa:1986:FFT,
  author =       "M. Hasegawa and Y. Shigei",
  title =        "{$ A T^2 = O(N \log^4 N), T = O(\log N) $} {Fast
                 Fourier Transform} in a light connected
                 {$3$}-dimensional {VLSI}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "252--260",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sapiecha:1986:MAH,
  author =       "K. Sapiecha and R. Jarocki",
  title =        "Modular architecture for high performance
                 implementation of {FFT} algorithm",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "261--270",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Navarro:1986:CSI,
  author =       "J. J. Navarro and J. M. Llaberia and M. Valero",
  title =        "Computing size-independent matrix problems on systolic
                 array processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "271--278",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tomita:1986:CLL,
  author =       "S. Tomita and K. Shibayama and T. Nakata and S. Yuasa
                 and H. Hagiwara",
  title =        "A computer with low-level parallelism {QA-2}: its
                 applications to {$3$-D} graphics and {Prolog\slash
                 Lisp} machines",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "280--289",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hirayama:1986:VOA,
  author =       "M. Hirayama",
  title =        "{VLSI} oriented asynchronous architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "290--296",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hwu:1986:HHP,
  author =       "W. Hwu and Y. N. Patt",
  title =        "{HPSm}, a high performance restricted data flow
                 architecture having minimal functionality",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "297--306",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Onaga:1986:DRA,
  author =       "K. Onaga and T. Takechi",
  title =        "On design of rotary array communication and
                 wavefront-driven algorithms for solving large-scale
                 band-limited matrix equations",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "308--315",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Napolitano:1986:CAD,
  author =       "L. M. {Napolitano, Jr.}",
  title =        "A computer architecture for dynamic finite element
                 analysis",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "316--323",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Harper:1986:PEV,
  author =       "D. T. {Harper III} and J. R. Jump",
  title =        "Performance evaluation of vector accesses in parallel
                 memories using a skewed storage scheme",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "324--328",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kondo:1986:PMA,
  author =       "T. Kondo and T. Tsuchiya and T. Kitamura and Y.
                 Sugiyama and T. Kimura",
  title =        "Pseudo {MIMD} array processor---{AAP2}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "330--337",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Fisher:1986:SLA,
  author =       "A. L. Fisher",
  title =        "Scan line array processors for image computation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "338--345",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Annaratone:1986:WAI,
  author =       "M. Annaratone and E. Arnould and T. Gross and H. T.
                 Kung and M. S. Lam",
  title =        "{Warp} architecture and implementation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "346--356",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wood:1986:CAT,
  author =       "D. A. Wood and S. J. Eggers and G. Gibson and M. D.
                 Hill and J. M. Pendleton",
  title =        "An in-cache address translation mechanism",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "358--365",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Cheriton:1986:SCC,
  author =       "D. R. Cheriton and G. A. Slavenburg and P. D. Boyle",
  title =        "Software-controlled caches in the {VMP}
                 multiprocessor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "366--374",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Goodman:1986:URV,
  author =       "J. R. Goodman and W. C. Hsu",
  title =        "On the use of registers vs. cache to minimize memory
                 traffic",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "375--383",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hsu:1986:HCS,
  author =       "P. Y. T. Hsu and E. S. Davidson",
  title =        "Highly concurrent scalar processing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "386--395",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{McFarling:1986:RCB,
  author =       "S. McFarling and J. Hennessy",
  title =        "Reducing the cost of branches",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "396--403",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kunkel:1986:OPS,
  author =       "S. R. Kunkel and J. E. Smith",
  title =        "Optimal pipelining in supercomputers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "404--411",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sweazey:1986:CCC,
  author =       "P. Sweazey and A. J. Smith",
  title =        "A class of compatible cache consistency protocols and
                 their support by the {IEEE Futurebus}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "414--423",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bitar:1986:MCS,
  author =       "P. Bitar and A. M. Despain",
  title =        "Multiprocessor cache synchronization: issues,
                 innovations, evolution",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "424--433",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Dubois:1986:MAB,
  author =       "M. Dubois and C. Scheurich and F. Briggs",
  title =        "Memory access buffering in multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "434--442",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Taylor:1986:ESL,
  author =       "G. S. Taylor and P. N. Hilfinger and J. R. Larus and
                 D. A. Patterson and B. G. Zorn",
  title =        "Evaluation of the {SPUR Lisp} architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "2",
  pages =        "444--452",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Woo:1986:RCC,
  author =       "Nam Sung Woo",
  title =        "A reply to comments {``A Comment on `A Hardware
                 Unification Unit: Design and Analysis'\,''}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "3",
  pages =        "2--4",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:55 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{DuBose:1986:MR,
  author =       "D. K. DuBose and D. K. Fotakis and D. Tabak",
  title =        "A microcoded {RISC}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "3",
  pages =        "5--16",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:55 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lang:1986:RRS,
  author =       "Tom{\'a}s Lang and Miquel Huguet",
  title =        "Reduced register saving\slash restoring in
                 single-window register files",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "3",
  pages =        "17--26",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:55 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Rouse:1986:TDH,
  author =       "Larry O'Neal Rouse",
  title =        "The twisted double helix: a minimum distance
                 architecture for 5th generation computing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "3",
  pages =        "27--33",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:55 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Harland:1986:RMT,
  author =       "David M. Harland",
  title =        "A recursively microcodable tagged architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "3",
  pages =        "34--40",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:55 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Alexander:1986:CMP,
  author =       "Cedell Alexander and William Keshlear and Furrokh
                 Cooper and Faye Briggs",
  title =        "Cache memory performance in a {Unix} environment",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "3",
  pages =        "41--61",
  month =        jun,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:55 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Stokes:1986:THV,
  author =       "Roger Stokes",
  title =        "Traces for hardware verification",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "4",
  pages =        "7--14",
  month =        sep,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:11 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kirner:1986:DDS,
  author =       "Claudio Kirner and Eduardo Marques",
  title =        "Design of a distributed system support based on a
                 centralized parallel bus",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "4",
  pages =        "15--26",
  month =        sep,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:11 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Irwin:1986:STR,
  author =       "Mary Jane Irwin",
  title =        "Secretary\slash Treasurer's {Report}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "4",
  pages =        "28--28",
  month =        sep,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:11 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Harland:1986:MOO,
  author =       "David M. Harland and Bruno Beloff",
  title =        "Microcoding an object-oriented instruction set",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "5",
  pages =        "3--12",
  month =        dec,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:18 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Stallings:1986:ABR,
  author =       "William Stallings",
  title =        "An annotated bibliography on reduced instruction set
                 computers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "14",
  number =       "5",
  pages =        "13--19",
  month =        dec,
  year =         "1986",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:18 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Halstead:1987:OCM,
  author =       "Robert H. {Halstead, Jr.}",
  title =        "Overview of {Concert MultiLisp}: a multiprocessor
                 symbolic computing system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "1",
  pages =        "5--14",
  month =        mar,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:31 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Patterson:1987:PRS,
  author =       "Dave Patterson",
  title =        "A progress report on {SPUR}: {February 1, 1987}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "1",
  pages =        "15--21",
  month =        mar,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:31 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Despain:1987:A,
  author =       "A. Despain and Y. Patt and V. Srini and P. Bitar and
                 W. Bush and C. Chien and W. Citrin and B. Fagin and W.
                 Hwu and S. Melvin and R. McGeer and A. Singhal and M.
                 Shebanow and P. {Van Roy}",
  title =        "Aquarius",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "1",
  pages =        "22--34",
  month =        mar,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:31 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kohli:1987:OPP,
  author =       "Madhur Kohli and Mark E. Giuliano and Jack Minker",
  title =        "An overview of the {PRISM} project",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "1",
  pages =        "35--42",
  month =        mar,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:31 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hermenegildo:1987:DHP,
  author =       "M. V. Hermenegildo and R. A. Warren",
  title =        "Designing a high performance parallel logic
                 programming system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "1",
  pages =        "43--52",
  month =        mar,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:31 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Mills:1987:CGR,
  author =       "Jonathan W. Mills",
  title =        "Coming to grips with a {RISC}: a report of the
                 progress of the {LOW RISC} design group",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "1",
  pages =        "53--62",
  month =        mar,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:31 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Short:1987:UIS,
  author =       "Brian Short",
  title =        "Use of instruction set simulators to evaluate the {LOW
                 RISC}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "1",
  pages =        "63--67",
  month =        mar,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:31 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gutzmann:1987:ODH,
  author =       "Kurt M. Gutzmann",
  title =        "Optimal dimension of hypercubes for sorting",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "1",
  pages =        "68--72",
  month =        mar,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:31 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chesley:1987:AWN,
  author =       "Gilman Chesley",
  title =        "Addressable {WSI}: a non-redundant approach",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "1",
  pages =        "73--80",
  month =        mar,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:31 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Biswas:1987:CCS,
  author =       "Nripendra N. Biswas and S. Srinivas and Trishala
                 Dharanendra",
  title =        "A centrally controlled shuffle network for
                 reconfigurable and fault-tolerant architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "1",
  pages =        "81--87",
  month =        mar,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:31 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ditzel:1987:BFC,
  author =       "D. R. Ditzel and H. R. McLellan",
  title =        "Branch folding in the {CRISP} microprocessor: reducing
                 branch delay to zero",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "2--8",
  month =        jun,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{DeRosa:1987:EBA,
  author =       "J. A. DeRosa and H. M. Levy",
  title =        "An evaluation of branch architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "10--16",
  month =        jun,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hwu:1987:CRO,
  author =       "W. W. Hwu and Y. N. Patt",
  title =        "Checkpoint repair for out-of-order execution
                 machines",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "18--26",
  month =        jun,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sohi:1987:IIL,
  author =       "G. S. Sohi and S. Vajapeyam",
  title =        "Instruction issue logic for high-performance,
                 interruptible pipelined processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "27--34",
  month =        jun,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Swensen:1987:FTS,
  author =       "J. Swensen and Y. Patt",
  title =        "Fast temporary storage for serial and parallel
                 execution",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "35--43",
  month =        jun,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wong:1987:PAD,
  author =       "K. Wong and M. A. Franklin",
  title =        "Performance analysis and design of a logic simulation
                 machine",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "46--55",
  month =        jun,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Doshi:1987:MSA,
  author =       "K. Doshi and P. Varman",
  title =        "A modular systolic architecture for image
                 convolutions",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "56--63",
  month =        jun,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Fujita:1987:TMA,
  author =       "S. Fujita and R. Aibara and M. Yamashita and T. Ae",
  title =        "A template matching algorithm using
                 optically-connected {$3$-D} {VLSI} architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "64--70",
  month =        jun,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Mendelson:1987:MDF,
  author =       "B. Mendelson and G. M. Silberman",
  title =        "Mapping data flow programs on a {VLSI} array of
                 processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "72--80",
  month =        jun,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ghosal:1987:AMA,
  author =       "D. Ghosal and L. N. Bhuyan",
  title =        "Analytical modeling and architectural modifications of
                 a dataflow computer",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "81--89",
  month =        jun,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Takesue:1987:URM,
  author =       "M. Takesue",
  title =        "A unified resource management and execution control
                 mechanism for data flow machines",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "90--97",
  month =        jun,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Abe:1987:HPI,
  author =       "S. Abe and T. Bandoh and S. Yamaguchi and K. Kurosawa
                 and K. Kiriyama",
  title =        "High performance integrated {Prolog} processor {IPP}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "100--107",
  month =        jun,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Fagin:1987:PSP,
  author =       "B. S. Fagin and A. M. Despain",
  title =        "Performance studies of a parallel {Prolog}
                 architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "108--116",
  month =        jun,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Civera:1987:EVP,
  author =       "P. L. Civera and F. Maddaleno and G. L. Piccinini and
                 M. Zamboni",
  title =        "An experimental {VLSI} {Prolog} interpreter:
                 preliminary measurements and results",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "117--126",
  month =        jun,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ridoux:1987:DSM,
  author =       "O. Ridoux",
  title =        "Deterministic and stochastic modeling of parallel
                 garbage collection: towards real-time criteria",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "128--136",
  month =        jun,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sun:1987:SEP,
  author =       "C. Sun and Y. Tsu",
  title =        "The sharing of environment in {AND--OR}-parallel
                 execution of logic programs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "137--144",
  month =        jun,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Guha:1987:AID,
  author =       "A. Guha and R. Ramnarayan and M. Derstine",
  title =        "Architectural issues in designing symbolic processors
                 in optics",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "145--151",
  month =        jun,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Varma:1987:RMS,
  author =       "A. Varma and C. S. Raghavendra",
  title =        "Rearrangeability of multistage shuffle\slash exchange
                 networks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "154--162",
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Beivide:1987:OMC,
  author =       "R. Beivide and E. Herrada and J. L. Balc{\'a}zar and
                 J. Labarta",
  title =        "Optimized mesh-connected networks for {SIMD} and
                 {MIMD} architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "163--170",
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Harper:1987:PER,
  author =       "D. T. {Harper III} and J. R. Jump",
  title =        "Performance evaluation of reduced bandwidth multistage
                 interconnection networks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "171--175",
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ramachandran:1987:HSI,
  author =       "U. Ramachandran and M. Solomon and M. Vernon",
  title =        "Hardware support for interprocess communication",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "178--188",
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Dally:1987:AMD,
  author =       "W. J. Dally and L. Chao and A. Chien and S. Hassoun
                 and W. Horwat and J. Kaplan and P. Song and B. Totty
                 and S. Wills",
  title =        "Architecture of a message-driven processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "189--196",
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kumar:1987:ESA,
  author =       "M. Kumar",
  title =        "Effect of storage allocation\slash reclamation methods
                 on parallelism and storage requirements",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "197--205",
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chang:1987:CDS,
  author =       "J. H. Chang and H. Chao and K. So",
  title =        "Cache design of a sub-micron {CMOS} {System\slash
                 370}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "208--213",
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Freeman:1987:APM,
  author =       "M. Freeman",
  title =        "An architectural perspective on a memory access
                 controller",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "214--223",
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Cheung:1987:OAG,
  author =       "K. Cheung and G. Sohi and K. Saluja and D. Pradhan",
  title =        "Organization and analysis of a gracefully-degrading
                 interleaved memory system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "224--231",
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Scheurich:1987:CMO,
  author =       "C. Scheurich and M. Dubois",
  title =        "Correct memory operation of cache-based
                 multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "234--243",
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wilson:1987:HCB,
  author =       "A. W. {Wilson, Jr.}",
  title =        "Hierarchical cache\slash bus architecture for shared
                 memory multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "244--252",
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lee:1987:MCD,
  author =       "R. L. Lee and P. C. Yew and D. H. Lawrie",
  title =        "Multiprocessor cache design considerations",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "253--262",
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Eickemeyer:1987:PEM,
  author =       "R. J. Eickemeyer and J. H. Patel",
  title =        "Performance evaluation of multiple register sets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "264--271",
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Stanley:1987:PAA,
  author =       "T. J. Stanley and R. G. Wedig",
  title =        "A performance analysis of automatically managed top of
                 stack buffers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "272--281",
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Moore:1987:CSV,
  author =       "B. Moore and A. Padegs and R. Smith and W. Buchholz",
  title =        "Concepts of the {System\slash 370} vector
                 architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "282--288",
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Pleszkun:1987:WRA,
  author =       "A. R. Pleszkun and J. R. Goodman and W. C. Hsu and R.
                 T. Joersz and G. Bier and P. Woest and P. B.
                 Schechter",
  title =        "{WISQ}: a restartable architecture using queues",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "290--299",
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chow:1987:ATD,
  author =       "P. Chow and M. Horowitz",
  title =        "Architectural tradeoffs in the design of {MIPS-X}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "300--308",
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ditzel:1987:HAC,
  author =       "D. R. Ditzel and H. R. McLellan and A. D. Berenbaum",
  title =        "The hardware architecture of the {CRISP}
                 microprocessor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "2",
  pages =        "309--319",
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 16:49:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Moore:1987:BDN,
  author =       "Matthew Moore and Charles McDowell",
  title =        "Bi-directional networks for large parallel
                 processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "3",
  pages =        "3--4",
  month =        jun,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:53 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kaplan:1987:LLG,
  author =       "Ian Kaplan",
  title =        "The {LDF 100}: a large grain dataflow parallel
                 processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "3",
  pages =        "5--12",
  month =        jun,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:53 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lass:1987:WCC,
  author =       "Stanley Lass",
  title =        "Wide channel computers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "3",
  pages =        "13--16",
  month =        jun,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:53 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bril:1987:IIA,
  author =       "Reinder J. Bril",
  title =        "An implementation independent approach to cache
                 memories",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "3",
  pages =        "17--24",
  month =        jun,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:53 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bril:1987:CLV,
  author =       "Reinder J. Bril",
  title =        "On cacheability of lock-variables in tightly coupled
                 multiprocessor systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "3",
  pages =        "25--32",
  month =        jun,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:53 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Iliffe:1987:FLM,
  author =       "J. K. Iliffe",
  title =        "A forward-looking method of {Cache} memory control",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "4",
  pages =        "4--10",
  month =        sep,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:11 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bandyopadhyay:1987:CBM,
  author =       "Amitava Bandyopadhyay and Yuan F. Zheng",
  title =        "Combining both microcode and hardwired control in
                 {RISC}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "4",
  pages =        "11--15",
  month =        sep,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:11 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Dowd:1987:ERV,
  author =       "Martin Dowd",
  title =        "An example {RISC} vector machine architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "4",
  pages =        "16--22",
  month =        sep,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:11 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bhatia:1987:MIN,
  author =       "Sanjiv K. Bhatia and A. G. Starling",
  title =        "Multilayered {Illiac} network scheme",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "4",
  pages =        "23--31",
  month =        sep,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:11 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Nowak:1987:SGP,
  author =       "Lothar Nowak",
  title =        "{SAMP}: a general purpose processor based on a
                 self-timed {VLIW} structure",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "4",
  pages =        "32--39",
  month =        sep,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:11 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ashenden:1987:LWP,
  author =       "Peter J. Ashenden and Chris J. Barter and Chris D.
                 Marlin",
  title =        "The {Leopard} workstation project",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "4",
  pages =        "40--51",
  month =        sep,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:11 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chiang:1987:DEL,
  author =       "Y. P. Chiang and M. L. Manwaring",
  title =        "Direct execution {Lisp} and cell memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "4",
  pages =        "52--57",
  month =        sep,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:11 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Terry:1987:FCM,
  author =       "J. M. Terry",
  title =        "Flow-control machines: the structured execution
                 architecture {(SXA)}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "4",
  pages =        "58--69",
  month =        sep,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:11 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wirth:1987:HAP,
  author =       "Niklaus Wirth",
  title =        "Hardware architectures for programming languages and
                 programming languages for hardware architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "5",
  pages =        "2--8",
  month =        oct,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:25 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Beck:1987:VAM,
  author =       "Bob Beck and Bob Kasten and Shreekant Thakkar",
  title =        "{VLSI} assist for a multiprocessor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "5",
  pages =        "10--20",
  month =        oct,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:25 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bisiani:1987:ASM,
  author =       "Roberto Bisiani and Alessandro Forin",
  title =        "Architectural support for multilanguage parallel
                 programming on heterogeneous systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "5",
  pages =        "21--30",
  month =        oct,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:25 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Rashid:1987:MIV,
  author =       "Richard Rashid and Avadis Tevanian and Michael Young
                 and David Golub and Robert Baron",
  title =        "Machine-independent virtual memory management for
                 paged uniprocessor and multiprocessor architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "5",
  pages =        "31--39",
  month =        oct,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:25 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hayes:1987:ADE,
  author =       "John R. Hayes and Martin E. Fraeman and Robert L.
                 Williams and Thomas Zaremba",
  title =        "An architecture for the direct execution of the
                 {Forth} programming language",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "5",
  pages =        "42--49",
  month =        oct,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:25 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Steenkiste:1987:TTC,
  author =       "Peter Steenkiste and John Hennessy",
  title =        "Tags and type checking in {LISP}: hardware and
                 software approaches",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "5",
  pages =        "50--59",
  month =        oct,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:25 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Davidson:1987:EIS,
  author =       "Jack W. Davidson and Richard A. Vaughan",
  title =        "The effect of instruction set complexity on program
                 size and memory performance",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "5",
  pages =        "60--64",
  month =        oct,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:25 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Atkinson:1987:DP,
  author =       "Russell R. Atkinson and Edward M. McCreight",
  title =        "The {Dragon} processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "5",
  pages =        "65--69",
  month =        oct,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:25 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Goodman:1987:CMV,
  author =       "James R. Goodman",
  title =        "Coherency for multiprocessor virtual address caches",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "5",
  pages =        "72--81",
  month =        oct,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:25 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Cargill:1987:CHS,
  author =       "T. A. Cargill and B. N. Locanthi",
  title =        "Cheap hardware support for software debugging and
                 profiling",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "5",
  pages =        "82--83",
  month =        oct,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:25 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Georgiou:1987:ECI,
  author =       "C. J. Georgiou and S. L. Palmer and P. L. Rosenfeld",
  title =        "An experimental coprocessor for implementing
                 persistent objects on an {IBM 4381}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "5",
  pages =        "84--87",
  month =        oct,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:25 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Magenheimer:1987:IMD,
  author =       "Daniel J. Magenheimer and Liz Peters and Karl Pettis
                 and Dan Zuras",
  title =        "Integer multiplication and division on the {HP}
                 {Precision Architecture}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "5",
  pages =        "90--99",
  month =        oct,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:25 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wall:1987:MEU,
  author =       "David W. Wall and Michael L. Powell",
  title =        "The {Mahler} experience: using an intermediate
                 language as the machine description",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "5",
  pages =        "100--104",
  month =        oct,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:25 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Weiss:1987:SSC,
  author =       "Shlomo Weiss and James E. Smith",
  title =        "A study of scalar compilation techniques for pipelined
                 supercomputers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "5",
  pages =        "105--109",
  month =        oct,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:25 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bush:1987:CSR,
  author =       "William R. Bush and A. Dain Samples and David Ungar
                 and Paul N. Hilfinger",
  title =        "Compiling {Smalltalk-80} to a {RISC}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "5",
  pages =        "112--116",
  month =        oct,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:25 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chow:1987:HMA,
  author =       {Chow, F. and Correll, S. and Himelstein, M. and
                 Killian, E. and Weber, L.},
  title =        {How many addressing modes are enough?},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {15},
  number =       {5},
  pages =        {117--121},
  month =        oct,
  year =         {1987},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:25 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Massalin:1987:SLS,
  author =       {Massalin, Henry},
  title =        {{Superoptimizer}: a look at the smallest program},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {15},
  number =       {5},
  pages =        {122--126},
  month =        oct,
  year =         {1987},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:25 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

%%% Second author's given name is spelled "Katsuto" (it had been keyed
%%% as "Katzuto"); confirm against the printed article if in doubt.
@Article{Taki:1987:PAE,
  author =       "Kazuo Taki and Katsuto Nakajima and Hiroshi Nakashima
                 and Morihiro Ikeda",
  title =        "Performance and architectural evaluation of the {PSI}
                 machine",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "15",
  number =       "5",
  pages =        "128--135",
  month =        oct,
  year =         "1987",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:25 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Borriello:1987:RVC,
  author =       {Borriello, Gaetano and Cherenson, Andrew R. and
                 Danzig, Peter B. and Nelson, Michael N.},
  title =        {{RISCs} vs. {CISCs} for {Prolog}: a case study},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {15},
  number =       {5},
  pages =        {136--145},
  month =        oct,
  year =         {1987},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:25 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Kieburtz:1987:RAS,
  author =       {Kieburtz, Richard B.},
  title =        {A {RISC} architecture for symbolic computation},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {15},
  number =       {5},
  pages =        {146--155},
  month =        oct,
  year =         {1987},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:25 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Ditzel:1987:DTS,
  author =       {Ditzel, David R. and McLellan, Hubert R. and
                 Berenbaum, Alan D.},
  title =        {Design tradeoffs to support the {C} programming
                 language in the {CRISP} microprocessor},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {15},
  number =       {5},
  pages =        {158--163},
  month =        oct,
  year =         {1987},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:25 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Thacker:1987:FMW,
  author =       {Thacker, Charles P. and Stewart, Lawrence C.},
  title =        {{Firefly}: a multiprocessor workstation},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {15},
  number =       {5},
  pages =        {164--172},
  month =        oct,
  year =         {1987},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:25 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Clark:1987:PPV,
  author =       {Clark, Douglas W.},
  title =        {Pipelining and performance in the {VAX 8800}
                 processor},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {15},
  number =       {5},
  pages =        {173--177},
  month =        oct,
  year =         {1987},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:25 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Colwell:1987:VAT,
  author =       {Colwell, Robert P. and Nix, Robert P. and
                 O'Donnell, John J. and Papworth, David B. and
                 Rodman, Paul K.},
  title =        {A {VLIW} architecture for a trace scheduling
                 compiler},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {15},
  number =       {5},
  pages =        {180--192},
  month =        oct,
  year =         {1987},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:25 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Levinthal:1987:PCG,
  author =       {Levinthal, Adam and Hanrahan, Pat and
                 Paquette, Mike and Lawson, Jim},
  title =        {Parallel computers for graphics applications},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {15},
  number =       {5},
  pages =        {193--198},
  month =        oct,
  year =         {1987},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:25 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Smith:1987:ZCP,
  author =       {Smith, J. E. and Dermer, G. E. and
                 Vanderwarn, B. D. and Klinger, S. D. and
                 Rozewski, C. M.},
  title =        {The {ZS-1} central processor},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {15},
  number =       {5},
  pages =        {199--204},
  month =        oct,
  year =         {1987},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:25 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Frietman:1987:EOD,
  author =       {Frietman, E. E. E. and Ruighaver, A. B.},
  title =        {An electro-optic data communication system for
                 the {Delft} parallel processor},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {15},
  number =       {6},
  pages =        {2--8},
  month =        dec,
  year =         {1987},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:28 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Shippen:1987:TTD,
  author =       {Shippen, G. B. and Archibald, J. K.},
  title =        {A tagged token dataflow machine for computing
                 small, iterative algorithms},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {15},
  number =       {6},
  pages =        {9--18},
  month =        dec,
  year =         {1987},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:28 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Penn:1988:PSI,
  author =       {Penn, Clif},
  title =        {Preface to the {Special} issue on {Neural Networks}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {16},
  number =       {1},
  pages =        {6--6},
  month =        mar,
  year =         {1988},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:31 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Lippmann:1988:ICN,
  author =       {Lippmann, Richard P.},
  title =        {An introduction to computing with neural nets},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {16},
  number =       {1},
  pages =        {7--25},
  month =        mar,
  year =         {1988},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:31 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Anderson:1988:SNN,
  author =       {Anderson, James A. and Wisniewski, Edward J. and
                 Viscuso, Susan R.},
  title =        {Software for neural networks},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {16},
  number =       {1},
  pages =        {26--36},
  month =        mar,
  year =         {1988},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:31 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Garth:1988:ISN,
  author =       {Garth, Simon and Pike, Danny},
  title =        {An integrated system for neural network simulations},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {16},
  number =       {1},
  pages =        {37--44},
  month =        mar,
  year =         {1988},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:31 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Maren:1988:CRI,
  author =       {Maren, A. Jean},
  title =        {Conference report: {IEEE First International
                 Conference on Neural Networks}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {16},
  number =       {1},
  pages =        {45--46},
  month =        mar,
  year =         {1988},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:31 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Dongarra:1988:PVC,
  author =       {Dongarra, Jack J.},
  title =        {Performance of various computers using standard
                 linear equations software in a {FORTRAN}
                 environment},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {16},
  number =       {1},
  pages =        {47--69},
  month =        mar,
  year =         {1988},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:31 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Wulf:1988:WCA,
  author =       {Wulf, Wm. A.},
  title =        {The {WM} computer architecture},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {16},
  number =       {1},
  pages =        {70--84},
  month =        mar,
  year =         {1988},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:31 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Tabak:1988:LIM,
  author =       {Tabak, Daniel},
  title =        {Logarithmic indices for multiprocessor evaluation},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {16},
  number =       {1},
  pages =        {85--90},
  month =        mar,
  year =         {1988},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:31 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Dowd:1988:ERV,
  author =       {Dowd, Martin},
  title =        {An example {RISC} vector machine architecture},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {16},
  number =       {1},
  pages =        {91--99},
  month =        mar,
  year =         {1988},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:31 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Dowd:1988:RVC,
  author =       {Dowd, Martin},
  title =        {{RISC} vector {CPU}'s and crossbars in desktops},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {16},
  number =       {1},
  pages =        {100--102},
  month =        mar,
  year =         {1988},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:31 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Lass:1988:MIO,
  author =       {Lass, Stanley},
  title =        {Multiple instructions\slash operands per access
                 to cache memory},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {16},
  number =       {1},
  pages =        {103--103},
  month =        mar,
  year =         {1988},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:31 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Gass:1988:WRS,
  author =       {Gass, Wanda},
  title =        {Workshop report: synthesis of foo bars},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {16},
  number =       {1},
  pages =        {104--108},
  month =        mar,
  year =         {1988},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:31 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Ferguson:1988:BRL,
  author =       {Ferguson, F. Joel},
  title =        {Book Review: {{\em Logic Design Principles\/}} by
                 {Edward J. McCluskey, Prentice-Hall Publishers,
                 Englewood Cliffs, New Jersey, 549 pp., \$39.95}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {16},
  number =       {1},
  pages =        {109--109},
  month =        mar,
  year =         {1988},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:31 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Ghosh:1988:CIM,
  author =       {Ghosh, J. and Hwang, K.},
  title =        {Critical issues in mapping neural networks on
                 message-passing multicomputers},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {16},
  number =       {2},
  pages =        {3--11},
  month =        may,
  year =         {1988},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:45 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Takefuji:1988:MCS,
  author =       {Takefuji, Y. and Jannarone, R. and Cho, Y. B. and
                 Chen, T.},
  title =        {Multinomial conjunctoid statistical learning
                 machines},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {16},
  number =       {2},
  pages =        {12--17},
  month =        may,
  year =         {1988},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:45 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Louri:1988:BPA,
  author =       {Louri, A. and Hwang, K.},
  title =        {A bit-plane architecture for optical computing
                 with two-dimensional symbolic substitution},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {16},
  number =       {2},
  pages =        {18--27},
  month =        may,
  year =         {1988},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:45 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Fiske:1988:RAP,
  author =       {Fiske, S. and Dally, W. J.},
  title =        {The reconfigurable arithmetic processor},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {16},
  number =       {2},
  pages =        {30--36},
  month =        may,
  year =         {1988},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:45 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Pleszkun:1988:PPM,
  author =       {Pleszkun, A. R. and Sohi, G. S.},
  title =        {The performance potential of multiple functional
                 unit processors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {16},
  number =       {2},
  pages =        {37--44},
  month =        may,
  year =         {1988},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:45 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Hwu:1988:EPM,
  author =       {Hwu, W. W. and Chang, P. P.},
  title =        {Exploiting parallel microprocessor
                 microarchitectures with a compiler code generator},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {16},
  number =       {2},
  pages =        {45--53},
  month =        may,
  year =         {1988},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:45 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{McNiven:1988:AMR,
  author =       {McNiven, G. D. and Davidson, E. S.},
  title =        {Analysis of memory referencing behavior for
                 design of local memories},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {16},
  number =       {2},
  pages =        {56--63},
  month =        may,
  year =         {1988},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:45 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

%%% First author's surname is "Eickemeyer".  The citation key keeps the
%%% earlier "Eickenmeyer" spelling so that existing citations of this
%%% entry continue to resolve.
@Article{Eickenmeyer:1988:PEC,
  author =       "R. J. Eickemeyer and J. H. Patel",
  title =        "Performance evaluation of on-chip register and cache
                 organizations",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "16",
  number =       "2",
  pages =        "64--72",
  month =        may,
  year =         "1988",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Baer:1988:IPM,
  author =       {Baer, J.-L. and Wang, W.-H.},
  title =        {On the inclusion properties for multi-level
                 cache hierarchies},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {16},
  number =       {2},
  pages =        {73--80},
  month =        may,
  year =         {1988},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:45 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Short:1988:SST,
  author =       {Short, R. T. and Levy, H. M.},
  title =        {A simulation study of two-level caches},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {16},
  number =       {2},
  pages =        {81--88},
  month =        may,
  year =         {1988},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:45 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Chow:1988:HNH,
  author =       {Chow, E. and Madan, H. and Peterson, J. and
                 Grunwald, D. and Reed, D.},
  title =        {Hyperswitch network for the hypercube computer},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {16},
  number =       {2},
  pages =        {90--99},
  month =        may,
  year =         {1988},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:45 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Winsor:1988:ABH,
  author =       {Winsor, D. C. and Mudge, T. N.},
  title =        {Analysis of bus hierarchies for multiprocessors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {16},
  number =       {2},
  pages =        {100--107},
  month =        may,
  year =         {1988},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:45 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Wei:1988:EGN,
  author =       {Wei, S. and Lee, G.},
  title =        {Extra group network: a cost-effective
                 fault-tolerant multistage interconnection network},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {16},
  number =       {2},
  pages =        {108--115},
  month =        may,
  year =         {1988},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:45 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Jiang:1988:PMB,
  author =       {Jiang, H. and Smith, K. C.},
  title =        {A partial-multiple-bus computer structure with
                 improved cost effectiveness},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {16},
  number =       {2},
  pages =        {116--122},
  month =        may,
  year =         {1988},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:45 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Watson:1988:FPA,
  author =       {Watson, I. and Woods, V. and Watson, P. and
                 Banach, R. and Greenberg, M. and Sargeant, J.},
  title =        {{Flagship}: a parallel architecture for
                 declarative programming},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {16},
  number =       {2},
  pages =        {124--130},
  month =        may,
  year =         {1988},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:45 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Iannucci:1988:TDN,
  author =       {Iannucci, R. A.},
  title =        {Toward a dataflow\slash {von Neumann} hybrid
                 architecture},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {16},
  number =       {2},
  pages =        {131--140},
  month =        may,
  year =         {1988},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:45 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Culler:1988:RRD,
  author =       {Culler, D. E. and Arvind},
  title =        {Resource requirements of dataflow programs},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {16},
  number =       {2},
  pages =        {141--150},
  month =        may,
  year =         {1988},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:45 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Sprunt:1988:PDP,
  author =       {Sprunt, B. and Kirk, D. and Sha, L.},
  title =        {Priority-driven, preemptive {I/O} controllers
                 for real-time systems},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {16},
  number =       {2},
  pages =        {152--159},
  month =        may,
  year =         {1988},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:45 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Shukla:1988:KIP,
  author =       {Shukla, S. B. and Agrawal, D. P.},
  title =        {A kernel-independent, pipelined architecture
                 for real-time {$2$-D} convolution},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {16},
  number =       {2},
  pages =        {160--166},
  month =        may,
  year =         {1988},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:45 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Liu:1988:EBL,
  author =       {Liu, W. and Yeh, T.-F. and Batchelor, W. E. and
                 Cavin, R.},
  title =        {Exploiting bit level concurrency in real-time
                 geometric feature extractions},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {16},
  number =       {2},
  pages =        {167--174},
  month =        may,
  year =         {1988},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:45 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Clark:1988:MVP,
  author          = {Clark, D. W. and Bannon, P. J. and Keller, J. B.},
  title           = {Measuring {VAX 8800} performance with
                     a histogram hardware monitor},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {2},
  pages           = {176--185},
  month           = may,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Sites:1988:MCA,
  author          = {Sites, R. L. and Agarwal, A.},
  title           = {Multiprocessor cache analysis using {ATUM}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {2},
  pages           = {186--195},
  month           = may,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Ng:1988:TOB,
  author          = {Ng, S. and Lang, D. and Selinger, R.},
  title           = {Trade-offs between devices and paths in
                     achieving disk interleaving},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {2},
  pages           = {196--201},
  month           = may,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Jainandunsing:1988:DCC,
  author          = {Jainandunsing, K. and Deprettere, E. F.},
  title           = {Design of a concurrent computer for solving
                     systems of linear equations},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {2},
  pages           = {204--211},
  month           = may,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Wolfe:1988:WDH,
  author          = {Wolfe, A. and {Breternitz, Jr.}, M. and
                     Stephens, C. and Ting, A. L. and Kirk, D. B. and
                     {Bianchini, Jr.}, R. P. and Shen, J. P.},
  title           = {The white dwarf: a high-performance
                     application-specific processor},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {2},
  pages           = {212--222},
  month           = may,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Gaudiot:1988:SPD,
  author          = {Gaudiot, J. L. and Lin, C. M. and Hosseiniyar, M.},
  title           = {Solving partial differential equations in
                     a data-driven multiprocessor environment},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {2},
  pages           = {223--230},
  month           = may,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Lee:1988:SSP,
  author          = {Lee, D.},
  title           = {Scrambled storage for parallel memory systems},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {2},
  pages           = {232--239},
  month           = may,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Krishnaswamy:1988:ALC,
  author          = {Krishnaswamy, V. and Ahuja, S. and Carriero, N. and
                     Gelernter, D.},
  title           = {The architecture of a {Linda} coprocessor},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {2},
  pages           = {240--249},
  month           = may,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Kung:1988:DAS,
  author          = {Kung, H. T.},
  title           = {Deadlock avoidance for systolic communication},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {2},
  pages           = {252--260},
  month           = may,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{So:1988:CPV,
  author          = {So, K. and Zecca, V.},
  title           = {Cache performance of vector processors},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {2},
  pages           = {261--268},
  month           = may,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Vernon:1988:DRR,
  author          = {Vernon, M. K. and Manber, U.},
  title           = {Distributed round-robin and first-come
                     first-serve protocols and their applications
                     to multiprocessor bus arbitration},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {2},
  pages           = {269--279},
  month           = may,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

%%% Page range corrected from 280--298 to 280--289: the next article in
%%% this issue (Prybylski:1988:PTC) begins on page 290, so the former
%%% ending page overlapped it.
@Article{Agarwal:1988:EDS,
  author =       "A. Agarwal and R. Simoni and J. Hennessy and M.
                 Horowitz",
  title =        "An evaluation of directory schemes for cache
                 coherence",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "16",
  number =       "2",
  pages =        "280--289",
  month =        may,
  year =         "1988",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% First author's surname corrected from Prybylski to Przybylski.  The
%%% citation label is deliberately left unchanged so that existing
%%% citations of this entry continue to resolve.
@Article{Prybylski:1988:PTC,
  author =       "S. Przybylski and M. Horowitz and J. Hennessy",
  title =        "Performance tradeoffs in cache design",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "16",
  number =       "2",
  pages =        "290--298",
  month =        may,
  year =         "1988",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Second author's surname corrected from Vaidenbaum to Veidenbaum.
@Article{Cheong:1988:CCS,
  author =       "H. Cheong and A. V. Veidenbaum",
  title =        "A cache coherence scheme with fast selective
                 invalidation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "16",
  number =       "2",
  pages =        "299--307",
  month =        may,
  year =         "1988",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Vernon:1988:AEP,
  author          = {Vernon, M. K. and Lazowska, E. D. and Zahorjan, J.},
  title           = {An accurate and efficient performance
                     analysis technique for multiprocessor snooping
                     cache-consistency protocols},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {2},
  pages           = {308--315},
  month           = may,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

%%% Network name in the title corrected from LADM to IADM (the Inverse
%%% Augmented Data Manipulator network); the leading letter had been
%%% misread as "L".
@Article{Rau:1988:DTR,
  author =       "D. Rau and J. A. B. Fortes and H. J. Siegel",
  title =        "Destination tag routing techniques based on a state
                 model for the {IADM} network",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "16",
  number =       "2",
  pages =        "318--324",
  month =        may,
  year =         "1988",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kim:1988:RCB,
  author          = {Kim, D. W. and Lipovski, G. J. and Hartmann, A. and
                     Jenevein, R.},
  title           = {Regular {CC}-banyan networks},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {2},
  pages           = {325--332},
  month           = may,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Jenevein:1988:TAR,
  author          = {Jenevein, R. M. and Mookken, T.},
  title           = {Traffic analysis of rectangular {SW}-banyan
                     networks},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {2},
  pages           = {333--342},
  month           = may,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Tamir:1988:HPM,
  author          = {Tamir, Y. and Frazier, G. L.},
  title           = {High-performance multi-queue buffers for
                     {VLSI} communications switches},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {2},
  pages           = {343--354},
  month           = may,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Preiss:1988:CBM,
  author          = {Preiss, B. R. and Hamacher, V. C.},
  title           = {A cache-based message passing scheme for
                     a shared-bus multiprocessor},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {2},
  pages           = {358--364},
  month           = may,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Boku:1988:IHP,
  author          = {Boku, T. and Nomura, S. and Amano, H.},
  title           = {{IMPULSE}: a high performance processing
                     unit for multiprocessors for scientific
                     calculation},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {2},
  pages           = {365--372},
  month           = may,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Eggers:1988:CSP,
  author          = {Eggers, S. J. and Katz, R. H.},
  title           = {A characterization of sharing in parallel
                     programs and its application to coherency
                     protocol evaluation},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {2},
  pages           = {373--382},
  month           = may,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Lipovski:1988:FOI,
  author          = {Lipovski, G. J. and Vaughan, P.},
  title           = {A fetch-and-op implementation for parallel
                     computers},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {2},
  pages           = {384--392},
  month           = may,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Seznec:1988:SPT,
  author          = {Seznec, A. and J{\'e}gou, Y.},
  title           = {Synchronizing processors through memory
                     requests in a tightly coupled multiprocessor},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {2},
  pages           = {393--400},
  month           = may,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Fujimoto:1988:DPS,
  author          = {Fujimoto, R. M. and Tsai, J.-J. and
                     Gopalakrishnan, G.},
  title           = {Design and performance of special purpose
                     hardware for time warp},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {2},
  pages           = {401--409},
  month           = may,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Cheriton:1988:VMI,
  author          = {Cheriton, D. R. and Gupta, A. and Boyle, P. D. and
                     Goosen, H. A.},
  title           = {The {VMP} multiprocessor: initial
                     experience, refinements, and performance
                     evaluation},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {2},
  pages           = {410--421},
  month           = may,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Goodman:1988:WMN,
  author          = {Goodman, J. R. and Woest, P. J.},
  title           = {The {Wisconsin} multicube: a new
                     large-scale cache-coherent multiprocessor},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {2},
  pages           = {422--431},
  month           = may,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Tick:1988:DBP,
  author          = {Tick, E.},
  title           = {Data buffer performance for sequential
                     {Prolog} architectures},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {2},
  pages           = {434--442},
  month           = may,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Halstead:1988:MMP,
  author          = {{Halstead, Jr.}, R. H. and Fujita, T.},
  title           = {{MASA}: a multithreaded processor
                     architecture for parallel symbolic computing},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {2},
  pages           = {443--451},
  month           = may,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Butler:1988:PAO,
  author          = {Butler, P. L. and {Allen, Jr.}, J. D. and
                     Bouldin, D. W.},
  title           = {Parallel architecture for {OPS5}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {2},
  pages           = {452--457},
  month           = may,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Cheriton:1988:CCM,
  author          = {Cheriton, David R. and Boyle, Pat and
                     Slavenburg, Gert A.},
  title           = {Comments on {``Coherency for multiprocessor
                     virtual addresses caches''} by {James R.
                     Goodman}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {3},
  pages           = {3--6},
  month           = jun,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:55 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

%%% Nested quotation marks in the title repaired: the inner quotation now
%%% opens with a left single quote (`) instead of an apostrophe, and it is
%%% closed (') before the outer double quotation is closed (''), with a
%%% thin space between the two closing marks.
@Article{Goodman:1988:RDR,
  author =       "James R. Goodman",
  title =        "Reply to {David R. Cheriton's, Pat Boyle's, and Gert
                 A. Slavenburg's ``Comments on `Coherency for
                 multiprocessor virtual addressed caches'\,'' by James
                 R. Goodman}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "16",
  number =       "3",
  pages =        "7--7",
  month =        jun,
  year =         "1988",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:55 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Rabbat:1988:TDC,
  author          = {Rabbat, Guy and Furht, Borko and Kibler, Ron},
  title           = {Three-dimensional computers and measuring
                     their performance},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {3},
  pages           = {9--16},
  month           = jun,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:55 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

%%% Machine name in the title capitalized as {MaRS} (was {MaRs}), so that
%%% it agrees with the spelling used in the title of the companion
%%% article Contessa:1988:AFT in this same issue.
@Article{Castan:1988:MPG,
  author =       "M. Castan and A. Contessa and E. Cousin and C. Coustet
                 and B. Lecussan",
  title =        "{MaRS}: a parallel graph reduction multiprocessor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "16",
  number =       "3",
  pages =        "17--24",
  month =        jun,
  year =         "1988",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:55 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Contessa:1988:AFT,
  author          = {Contessa, Alessandro},
  title           = {An approach to fault tolerance and error
                     recovery in a parallel graph reduction machine:
                     {MaRS}---a case study},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {3},
  pages           = {25--32},
  month           = jun,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:55 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Crawford:1988:EHH,
  author          = {Crawford, Chuck},
  title           = {Evolution of the {Harris H-series} computers
                     and speculations on their future},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {3},
  pages           = {33--39},
  month           = jun,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:55 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Good:1988:SIC,
  author          = {Good, Philip L.},
  title           = {Structuring an instruction cache},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {3},
  pages           = {40--43},
  month           = jun,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:55 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Johnson:1988:CMM,
  author          = {Johnson, Eric E.},
  title           = {Completing an {MIMD} multiprocessor taxonomy},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {3},
  pages           = {44--47},
  month           = jun,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:55 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Jones:1988:UR,
  author          = {Jones, Douglas W.},
  title           = {The ultimate {RISC}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {3},
  pages           = {48--55},
  month           = jun,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:55 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Jones:1988:MC,
  author          = {Jones, Douglas W.},
  title           = {A minimal {CISC}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {3},
  pages           = {56--63},
  month           = jun,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:55 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Lass:1988:SCM,
  author          = {Lass, Stanley},
  title           = {Shared cache multiprocessing with pack computers},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {3},
  pages           = {64--70},
  month           = jun,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:55 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Jouppi:1988:SVS,
  author          = {Jouppi, Norman P.},
  title           = {Superscalar vs. superpipelined machines},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {3},
  pages           = {71--80},
  month           = jun,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:55 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Schachter:1988:BRH,
  author          = {Schachter, Lorne H.},
  title           = {Book review of {{\em High-Performance
                     Computer Architecture\/}} by {Harold S. Stone.
                     Addison-Wesley 1987}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {3},
  pages           = {81--84},
  month           = jun,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:55 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Ramachandran:1988:PSI,
  author          = {Ramachandran, Umakishore},
  title           = {Preface to the {Special Issue on
                     Architectural Support for Operating Systems}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {4},
  pages           = {11--11},
  month           = sep,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:11 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Asthana:1988:IMS,
  author          = {Asthana, A. and Jagadish, H. V. and
                     Chandross, J. A. and Lin, D. and Knauer, S. C.},
  title           = {An intelligent memory system},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {16},
  number          = {4},
  pages           = {12--20},
  month           = sep,
  year            = {1988},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:11 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Beltrametti:1988:CMM,
  author =       "Monica Beltrametti and Kenneth Bobey and John R.
                 Zorbas",
  title =        "The control mechanism for the {Myrias} parallel
                 computer system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "16",
  number =       "4",
  pages =        "21--30",
  month =        sep,
  year =         "1988",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:11 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Finkel:1988:YSM,
  author =       "Raphael Finkel and Debra Hensgen",
  title =        "{YACKOS} on a shared-memory multiprocessor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "16",
  number =       "4",
  pages =        "31--36",
  month =        sep,
  year =         "1988",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:11 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Pucci:1988:OCE,
  author =       "Marc F. Pucci and J. L. Alberi",
  title =        "Optimized communication in an extended remote
                 procedure call model",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "16",
  number =       "4",
  pages =        "37--46",
  month =        sep,
  year =         "1988",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:11 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Cortadella:1988:DRC,
  author =       "Jordi Cortadella and Teodor Jov{\'e}",
  title =        "Dynamic {RAM} for on-chip instruction caches",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "16",
  number =       "4",
  pages =        "45--50",
  month =        sep,
  year =         "1988",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:11 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Naderi:1988:MPEa,
  author =       "M. Naderi",
  title =        "Modelling and performance evaluation of
                 multiprocessors organization with shared memories",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "16",
  number =       "4",
  pages =        "51--74",
  month =        sep,
  year =         "1988",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:11 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gehringer:1988:SCP,
  author =       "Edward Gehringer and Janne Abullarade and Michael H.
                 Gulyn",
  title =        "A survey of commercial parallel processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "16",
  number =       "4",
  pages =        "75--107",
  month =        sep,
  year =         "1988",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:11 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lease:1988:CPS,
  author =       "Mark Lease and Mac Lively",
  title =        "Comparing production system architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "16",
  number =       "4",
  pages =        "108--116",
  month =        sep,
  year =         "1988",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:11 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Page:1988:FAH,
  author =       "Ivor Page and Jeff Niehaus",
  title =        "The {Flex} architecture, a high speed graphics
                 processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "16",
  number =       "4",
  pages =        "117--129",
  month =        sep,
  year =         "1988",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:11 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Murakami:1988:OKU,
  author =       "Kazuaki Murakami and Akira Fukuda and Toshinori
                 Sueyoshi and Shinji Tomita",
  title =        "An overview of the {Kyushu University} reconfigurable
                 parallel processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "16",
  number =       "4",
  pages =        "130--137",
  month =        sep,
  year =         "1988",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:11 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Percus:1988:SRC,
  author =       "Ora E. Percus and J. K. Percus",
  title =        "Some results concerning clock-regulated queues",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "16",
  number =       "4",
  pages =        "138--144",
  month =        sep,
  year =         "1988",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:11 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Williams:1988:SSS,
  author =       "Fleur Liane Williams",
  title =        "Should {SCC} set condition codes?",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "16",
  number =       "4",
  pages =        "145--149",
  month =        sep,
  year =         "1988",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:11 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Steven:1988:NEA,
  author =       "Gordon B. Steven",
  title =        "A novel effective address calculation mechanism for
                 {RISC} microprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "16",
  number =       "4",
  pages =        "150--156",
  month =        sep,
  year =         "1988",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:11 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Parhami:1988:DFV,
  author =       "Behrooz Parhami",
  title =        "From defects to failures: a view of dependable
                 computing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "16",
  number =       "4",
  pages =        "157--168",
  month =        sep,
  year =         "1988",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:11 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Patterson:1988:RP,
  author =       "David A. Patterson",
  title =        "{RISCY} patents",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "16",
  number =       "4",
  pages =        "169--191",
  month =        sep,
  year =         "1988",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:11 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Takacs:1988:BRV,
  author =       "Helen C. Takacs",
  title =        "Book review: {{\em A VLSI Architecture for Concurrent
                 Data Structures\/}} by {William J. Dally (Kluwer
                 1988)}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "16",
  number =       "4",
  pages =        "192--193",
  month =        sep,
  year =         "1988",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:11 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Colwell:1988:BRC,
  author =       "Robert P. Colwell",
  title =        "Book review: {{\em Computer Architecture and
                 Organization}}, 2nd ed. by {John P. Hayes (McGraw Hill,
                 1988)}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "16",
  number =       "4",
  pages =        "193--195",
  month =        sep,
  year =         "1988",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:11 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{McDowell:1988:BRS,
  author =       "Charles E. McDowell",
  title =        "Book review: {{\em Supercomputer Architectures\/}} by
                 {Paul B. Schneck (Kluwer Academic Publishers)}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "16",
  number =       "4",
  pages =        "195--196",
  month =        sep,
  year =         "1988",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:11 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hum:1988:SWF,
  author =       "Herbert H. J. Hum and Guang R. Gao",
  title =        "Summary of the workshop on frontiers in functional
                 programming and dataflow architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "16",
  number =       "5",
  pages =        "12--19",
  month =        dec,
  year =         "1988",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:19 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{vanTilborg:1988:IDC,
  author =       "Andre M. van Tilborg",
  title =        "Instrumentation for distributed computing systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "16",
  number =       "5",
  pages =        "20--25",
  month =        dec,
  year =         "1988",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:19 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Griffin:1988:UUR,
  author =       "Glenn W. Griffin",
  title =        "The ultimate ultimate {RISC}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "16",
  number =       "5",
  pages =        "26--32",
  month =        dec,
  year =         "1988",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:19 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Jones:1988:RCR,
  author =       "Douglas W. Jones",
  title =        "Risks of comparing {RISCs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "16",
  number =       "5",
  pages =        "33--34",
  month =        dec,
  year =         "1988",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:19 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Naderi:1988:MPEb,
  author =       "M. Naderi",
  title =        "Modelling and performance evaluation of
                 multiprocessors, organizations with multi-memory
                 units",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "16",
  number =       "5",
  pages =        "35--51",
  month =        dec,
  year =         "1988",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:19 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kogge:1988:VRB,
  author =       "Peter Kogge and John Oldfield and Mark Brule and
                 Charles Stormon",
  title =        "{VLSI} and rule-based systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "16",
  number =       "5",
  pages =        "52--65",
  month =        dec,
  year =         "1988",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:19 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Parhami:1988:BRM,
  author =       "Behrooz Parhami",
  title =        "Book review: {{\em Memory Storage Patterns in Parallel
                 Processing\/}} by {Mary A. Mace (Kluwer Academic
                 Publishers, Boston, 1987, 139 pp.)}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "16",
  number =       "5",
  pages =        "76--76",
  month =        dec,
  year =         "1988",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:19 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Moskowitz:1989:AMM,
  author =       "J. P. Moskowitz and C. Jousselin",
  title =        "An algebraic memory model",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "1",
  pages =        "55--62",
  month =        mar,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:32 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wong:1989:SAS,
  author =       "W. F. Wong",
  title =        "A stack addressing scheme based on windowing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "1",
  pages =        "63--69",
  month =        mar,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:32 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Anonymous:1989:PTD,
  author =       "Anonymous",
  title =        "Pipelining through {Dynamic Control ROM}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "1",
  pages =        "70--72",
  month =        mar,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:32 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lass:1989:SIC,
  author =       "Stanley E. Lass",
  title =        "Some innovations in computer architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "1",
  pages =        "73--77",
  month =        mar,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:32 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bitar:1989:BRR,
  author =       "Philip Bitar",
  title =        "Book reviews: Review of {{\em Parallel Execution of
                 Logic Programs\/}} by {John Conery. Kluwer Academic
                 Publishers 1987}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "1",
  pages =        "81--82",
  month =        mar,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:32 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Cohn:1989:ACT,
  author =       "Robert Cohn and Thomas Gross and Monica Lam and P. S.
                 Tseng",
  title =        "Architecture and compiler tradeoffs for a long
                 instruction word microprocessor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "2",
  pages =        "2--14",
  month =        apr,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:39 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sohi:1989:TIF,
  author =       "Gurindar S. Sohi and Sriram Vajapeyam",
  title =        "Tradeoffs in instruction format design for horizontal
                 architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "2",
  pages =        "15--25",
  month =        apr,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:39 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Dehnert:1989:OLS,
  author =       "James C. Dehnert and Peter Y.-T. Hsu and Joseph P.
                 Bratt",
  title =        "Overlapped loop support in the {Cydra 5}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "2",
  pages =        "26--38",
  month =        apr,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:39 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Burkowski:1989:ASS,
  author =       "F. J. Burkowski and G. V. Cormack and G. D. P. Dueck",
  title =        "Architectural support for synchronous task
                 communication",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "2",
  pages =        "40--53",
  month =        apr,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:39 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gupta:1989:FBM,
  author =       "Rajiv Gupta",
  title =        "The fuzzy barrier: a mechanism for high speed
                 synchronization of processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "2",
  pages =        "54--63",
  month =        apr,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:39 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Goodman:1989:ESP,
  author =       "James R. Goodman and Mary K. Vernon and Philip J.
                 Woest",
  title =        "Efficient synchronization primitives for large-scale
                 cache-coherent multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "2",
  pages =        "64--75",
  month =        apr,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:39 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Mellor-Crummey:1989:SIC,
  author =       "J. M. Mellor-Crummey and T. J. LeBlanc",
  title =        "A software instruction counter",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "2",
  pages =        "78--86",
  month =        apr,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:39 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Aral:1989:EDP,
  author =       "Z. Aral and I. Gertner and G. Schaffer",
  title =        "Efficient debugging primitives for multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "2",
  pages =        "87--95",
  month =        apr,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:39 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Staknis:1989:SMA,
  author =       "M. E. Staknis",
  title =        "Sheaved memory: architectural support for state saving
                 and restoration in paged systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "2",
  pages =        "96--102",
  month =        apr,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:39 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Holliday:1989:RHP,
  author =       "M. A. Holliday",
  title =        "Reference history, page size, and migration daemons in
                 local\slash remote architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "2",
  pages =        "104--112",
  month =        apr,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:39 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Black:1989:TLB,
  author =       "D. L. Black and R. F. Rashid and D. B. Golub and C. R.
                 Hill and R. V. Baron",
  title =        "Translation lookaside buffer consistency: a software
                 approach",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "2",
  pages =        "113--122",
  month =        apr,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:39 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gibson:1989:FCT,
  author =       "G. A. Gibson and L. Hellerstein and R. M. Karp and R.
                 H. Katz and D. A. Patterson",
  title =        "Failure correction techniques for large disk arrays",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "2",
  pages =        "123--132",
  month =        apr,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:39 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Jouppi:1989:UVS,
  author =       "N. P. Jouppi and J. Bertoni and D. W. Wall",
  title =        "A unified vector\slash scalar floating-point
                 architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "2",
  pages =        "134--143",
  month =        apr,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:39 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Mulder:1989:DBR,
  author =       "H. Mulder",
  title =        "Data buffering: run-time versus compile-time support",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "2",
  pages =        "144--151",
  month =        apr,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:39 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Adams:1989:AIS,
  author =       "T. L. Adams and R. E. Zimmerman",
  title =        "An analysis of 8086 instruction set usage in {MS DOS}
                 programs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "2",
  pages =        "152--160",
  month =        apr,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:39 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Roos:1989:RTS,
  author =       "J. Roos",
  title =        "A real-time support processor for {Ada} tasking",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "2",
  pages =        "162--171",
  month =        apr,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:39 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Vegdahl:1989:RES,
  author =       "Steven R. Vegdahl and Uwe F. Pleban",
  title =        "The runtime environment for {Screme}, a {Scheme}
                 implementation on the 88000",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "2",
  pages =        "172--182",
  month =        apr,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:39 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{McFarling:1989:POI,
  author =       "S. McFarling",
  title =        "Program optimization for instruction caches",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "2",
  pages =        "183--191",
  month =        apr,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:39 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Karger:1989:URO,
  author =       "Paul A. Karger",
  title =        "Using registers to optimize cross-domain call
                 performance",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "2",
  pages =        "194--204",
  month =        apr,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:39 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Arnould:1989:DNN,
  author =       "Emmanuel Arnould and H. T. Kung and Fran{\c{c}}ois
                 Bitz and Robert D. Sansom and Eric C. Cooper",
  title =        "The design of {Nectar}: a network backplane for
                 heterogeneous multicomputers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "2",
  pages =        "205--216",
  month =        apr,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:39 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Delgado-Rannauro:1989:MDP,
  author =       "S. A. Delgado-Rannauro and T. J. Reynolds",
  title =        "A message driven {OR}-parallel machine",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "2",
  pages =        "217--228",
  month =        apr,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:39 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Owicki:1989:EPS,
  author =       "S. Owicki and A. Agarwal",
  title =        "Evaluating the performance of software cache
                 coherence",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "2",
  pages =        "230--242",
  month =        apr,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:39 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Weber:1989:ACI,
  author =       "W.-D. Weber and A. Gupta",
  title =        "Analysis of cache invalidation patterns in
                 multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "2",
  pages =        "243--256",
  month =        apr,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:39 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Eggers:1989:ESC,
  author =       "S. J. Eggers and R. H. Katz",
  title =        "The effect of sharing on the cache and bus performance
                 of parallel programs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "2",
  pages =        "257--270",
  month =        apr,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:39 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Jouppi:1989:AIL,
  author =       "N. P. Jouppi and D. W. Wall",
  title =        "Available instruction-level parallelism for
                 superscalar and superpipelined machines",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "2",
  pages =        "272--282",
  month =        apr,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:39 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Dally:1989:MOF,
  author =       "W. J. Dally",
  title =        "Micro-optimization of floating-point operations",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "2",
  pages =        "283--289",
  month =        apr,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:39 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Smith:1989:LMI,
  author =       "M. D. Smith and M. Johnson and M. A. Horowitz",
  title =        "Limits on multiple instruction issue",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "2",
  pages =        "290--302",
  month =        apr,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:39 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Eggers:1989:EPF,
  author =       "S. J. Eggers and R. H. Katz",
  title =        "Evaluating the performance of four snooping cache
                 coherency protocols",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "2--15",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Cheriton:1989:MLS,
  author =       "D. R. Cheriton and H. A. Goosen and P. D. Boyle",
  title =        "Multi-level shared caching techniques for scalability
                 in {VMP-M/C}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "16--24",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Goto:1989:DPC,
  author =       "A. Goto and A. Matsumoto and E. Tick",
  title =        "Design and performance of a coherent cache for
                 parallel logic programming architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "25--33",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Grafe:1989:EDP,
  author =       "V. G. Grafe and G. S. Davidson and J. E. Hoch and V.
                 P. Holmes",
  title =        "The {Epsilon} dataflow processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "36--45",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sakai:1989:ADS,
  author =       "S. Sakai and Y. Yamaguchi and K. Hiraki and Y. Kodama
                 and T. Yuba",
  title =        "An architecture of a dataflow single chip processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "46--53",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Nitezki:1989:EDP,
  author =       "P. Nitezki",
  title =        "Exploiting data parallelism in signal processing on a
                 dataflow machine",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "54--61",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ibbett:1989:AMS,
  author =       "R. N. Ibbett and T. M. Hopkins and K. I. M. McKinnon",
  title =        "Architectural mechanisms to support sparse vector
                 processing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "64--71",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Harper:1989:DSS,
  author =       "D. T. Harper and D. A. Linebarger",
  title =        "A dynamic storage scheme for conflict-free vector
                 access",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "72--77",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Murakami:1989:SSI,
  author =       "K. Murakami and N. Irie and S. Tomita",
  title =        "{SIMP} (Single Instruction stream\slash Multiple
                 instruction Pipelining): a novel high-speed
                 single-processor architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "78--85",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ben-Asher:1989:DSA,
  author =       "Y. Ben-Asher and D. Egozi and A. Schuster",
  title =        "{$2$-D SIMD} algorithms in the perfect shuffle
                 networks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "88--95",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Valero-Garcia:1989:SHA,
  author =       "M. Valero-Garcia and J. J. Navarro and J. M. Llaberia
                 and M. Valero",
  title =        "Systematic hardware adaptation of systolic
                 algorithms",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "96--104",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chen:1989:TMH,
  author =       "M.-S. Chen and K. G. Shin",
  title =        "Task migration in hypercube multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "105--111",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Przybylski:1989:CPO,
  author =       "S. Przybylski and M. Horowitz and J. Hennessy",
  title =        "Characteristics of performance-optimal multi-level
                 cache hierarchies",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "114--121",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wood:1989:SRD,
  author =       "D. A. Wood and R. H. Katz",
  title =        "Supporting reference and dirty bits in {SPUR}'s
                 virtual address cache",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "122--130",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kessler:1989:IIS,
  author =       "R. E. Kessler and R. Jooss and A. Lebeck and M. D.
                 Hill",
  title =        "Inexpensive implementations of set-associativity",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "131--139",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wang:1989:OPT,
  author =       "W. H. Wang and J.-L. Baer and H. M. Levy",
  title =        "Organization and performance of a two-level
                 virtual-real cache hierarchy",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "140--148",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Jesshope:1989:HPC,
  author =       "C. R. Jesshope and P. R. Miller and J. T. Yantchev",
  title =        "High performance communications in processor
                 networks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "150--157",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Mizrahi:1989:IMS,
  author =       "H. E. Mizrahi and J.-L. Baer and E. D. Lazowska and J.
                 Zahorjan",
  title =        "Introducing memory into the switch elements of
                 multiprocessor interconnection networks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "158--166",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Scott:1989:UFC,
  author =       "S. L. Scott and G. S. Sohi",
  title =        "Using feedback to control tree saturation in
                 multistage interconnection networks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "167--176",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ezhilchelvan:1989:CRS,
  author =       "P. D. Ezhilchelvan and S. K. Shrivastava and A.
                 Tully",
  title =        "Constructing replicated systems using processors with
                 point-to-point communication links",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "177--184",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Benker:1989:KKC,
  author =       "H. Benker and J. M. Beacco and M. Dorochevsky and Th.
                 Jeffr{\'e} and A. P{\"o}hlmann and J. Noy{\'e} and B.
                 Poterie and J. C. Syre and O. Thibault and G.
                 Watzlawik",
  title =        "{KCM}: a knowledge crunching machine",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "186--194",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Singhal:1989:HPP,
  author =       "A. Singhal and Y. N. Patt",
  title =        "A high performance {Prolog} processor with multiple
                 function units",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "195--202",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Morioka:1989:EMS,
  author =       "M. Morioka and S. Yamaguchi and T. Bandoh",
  title =        "Evaluation of memory system for integrated {Prolog}
                 processor {IPP}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "203--210",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wong:1989:TDH,
  author =       "K.-F. Wong and M. H. Williams",
  title =        "A type driven hardware engine for {Prolog} clause
                 retrieval over a large knowledge base",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "211--222",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hwu:1989:CSH,
  author =       "W. W. Hwu and T. M. Conte and P. P. Chang",
  title =        "Comparing software and hardware schemes for reducing
                 the cost of branches",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "224--233",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Farrens:1989:IPS,
  author =       "M. K. Farrens and A. R. Pleszkun",
  title =        "Improving performance of small on-chip instruction
                 caches",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "234--241",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hwu:1989:AHI,
  author =       "W. W. Hwu and P. P. Chang",
  title =        "Achieving high instruction cache performance with an
                 optimizing compiler",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "242--251",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Steenkiste:1989:ICD,
  author =       "P. Steenkiste",
  title =        "The impact of code density on instruction cache
                 performance",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "252--259",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Nikhil:1989:CDS,
  author =       "R. S. Nikhil",
  title =        "Can dataflow subsume {von Neumann} computing?",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "262--272",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Weber:1989:EBM,
  author =       "W.-D. Weber and A. Gupta",
  title =        "Exploring the benefits of multiple hardware contexts
                 in a multiprocessor architecture: preliminary results",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "273--280",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Jouppi:1989:AOT,
  author =       "N. P. Jouppi",
  title =        "Architectural and organizational tradeoffs in the
                 design of the {MultiTitan CPU}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "281--289",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sato:1989:RTC,
  author =       "M. Sato and S. Ichikawa and E. Goto",
  title =        "Run-time checking in {Lisp} by integrating memory
                 addressing and range checking",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "290--297",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hopper:1989:MVW,
  author =       "A. Hopper and A. Jones and D. Lioupis",
  title =        "Multiple vs.\ wide shared bus multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "300--306",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Annaratone:1989:PMC,
  author =       "M. Annaratone and R. R{\"u}hl",
  title =        "Performance measurements on a commercial
                 multiprocessor running parallel code",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "307--314",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Annaratone:1989:ICS,
  author =       "M. Annaratone and C. Pommerell and R. R{\"u}hl",
  title =        "Interprocessor communication speed and performance in
                 distributed-memory parallel processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "3",
  pages =        "315--324",
  month =        jun,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ghosal:1989:ACC,
  author          = {Ghosal, D. S. and Tripathi, S. K. and Bhuyan, L. N.
                     and Jiang, H.},
  title           = {Analysis of computation-communication issues
                     in dynamic dataflow architectures},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {3},
  pages           = {325--333},
  month           = jun,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:56 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Kravitz:1989:LSM,
  author          = {Kravitz, S. and Bryant, R. E. and Rutenbar, R.},
  title           = {Logic simulation on massively parallel architectures},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {3},
  pages           = {336--343},
  month           = jun,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:56 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Fukazawa:1989:RRP,
  author          = {Fukazawa, T. and Kimura, T. and Tomizawa, M. and
                     Takeda, K. and Itoh, Y.},
  title           = {{R256}: a research parallel processor
                     for scientific computation},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {3},
  pages           = {344--351},
  month           = jun,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:56 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Anido:1989:TPT,
  author          = {Anido, M. L. and Allerton, D. J. and Zaluska, E. J.},
  title           = {A three-port\slash three-access register file
                     for concurrent processing and {I/O} communication
                     in a {RISC}-like graphics engine},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {3},
  pages           = {354--361},
  month           = jun,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:56 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Mulder:1989:AFA,
  author          = {Mulder, J. M. and Portier, R. J. and Srivastava, A.
                     and in't Velt, R.},
  title           = {An architecture framework for application-specific
                     and scalable architectures},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {3},
  pages           = {362--369},
  month           = jun,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:56 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Kim:1989:PLS,
  author          = {Kim, K. and Prasanna-Kumar, V. K.},
  title           = {Perfect {Latin} squares and parallel array access},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {3},
  pages           = {372--379},
  month           = jun,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:56 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Weiss:1989:ASS,
  author          = {Weiss, S.},
  title           = {An aperiodic storage scheme to reduce memory
                     conflicts in vector processors},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {3},
  pages           = {380--386},
  month           = jun,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:56 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Chen:1989:AVA,
  author          = {Chen, C.-L. and Liao, C.-K.},
  title           = {Analysis of vector access performance
                     on skewed interleaved memory},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {3},
  pages           = {387--394},
  month           = jun,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:56 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Agarwal:1989:ABS,
  author          = {Agarwal, A. and Cherian, M.},
  title           = {Adaptive backoff synchronization techniques},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {3},
  pages           = {396--406},
  month           = jun,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:56 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Stenstrom:1989:CCP,
  author          = {Stenstr{\"o}m, P.},
  title           = {A cache consistency protocol for multiprocessors
                     with multistage networks},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {3},
  pages           = {407--415},
  month           = jun,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:56 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Su:1989:DSM,
  author          = {Su, H.-M. and Yew, P.-C.},
  title           = {On data synchronization for multiprocessors},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {3},
  pages           = {416--423},
  month           = jun,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:56 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{vanTilborg:1989:PFD,
  author          = {van Tilborg, A. M.},
  title           = {Panel on future directions in parallel
                     computer architecture},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {4},
  pages           = {3--53},
  month           = jun,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:08 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Gunther:1989:PBS,
  author          = {Gunther, N. J. and Noga, M. T.},
  title           = {{ParcBench}: a benchmark
                     for shared-memory architectures},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {4},
  pages           = {54--61},
  month           = jun,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:08 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Elkateeb:1989:PSR,
  author          = {Elkateeb, A. and Le-Ngoc, T.},
  title           = {A priority strategy on {RISC} for real-time
                     multitasking software applications},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {4},
  pages           = {62--68},
  month           = jun,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:08 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Oyang:1989:MCA,
  author          = {Oyang, Y.-J.},
  title           = {A multiprocessor configuration in accordance
                     with the aspects of physical and systems design},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {4},
  pages           = {69--73},
  month           = jun,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:08 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Seebauer:1989:MCEa,
  author          = {Seebauer, H.},
  title           = {A memory controller executing segment operations
                     in time {$ O(1) $}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {4},
  pages           = {74--81},
  month           = jun,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:08 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Schwartz:1989:DDD,
  author          = {Schwartz, R. J.},
  title           = {The design and development of a dynamic
                     program behavior measurement tool
                     for the {Intel 8086\slash 88}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {4},
  pages           = {82--94},
  month           = jun,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:08 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Martin:1989:FAM,
  author          = {Martin, A. J. and Burns, S. M. and Lee, T. K. and
                     Borkovic, D. and Hazewindus, P. J.},
  title           = {The first asynchronous microprocessor:
                     the test results},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {4},
  pages           = {95--110},
  month           = jun,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:08 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Cornett:1989:UMS,
  author          = {Cornett, F.},
  title           = {The {UT1000} microprogramming simulator:
                     an educational tool},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {4},
  pages           = {111--118},
  month           = jun,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:08 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Yuen:1989:BDD,
  author          = {Yuen, C. K. and Wong, W. F.},
  title           = {A bidirectional data driven {Lisp} engine
                     for the direct execution of {Lisp} in parallel},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {4},
  pages           = {119--130},
  month           = jun,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:08 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Smotherman:1989:SBT,
  author          = {Smotherman, M.},
  title           = {A sequencing-based taxonomy of {I/O} systems
                     and review of historical machines},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {5},
  pages           = {5--15},
  month           = sep,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:26 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Cousins:1989:DCR,
  author          = {Cousins, R.},
  title           = {{DMA} considerations on {RISC} workstations},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {5},
  pages           = {16--23},
  month           = sep,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:26 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Katz:1989:PHP,
  author          = {Katz, R. H.},
  title           = {A project on high performance {I/O} subsystems},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {5},
  pages           = {24--31},
  month           = sep,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:26 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Dibble:1989:BSB,
  author          = {Dibble, P. C. and Scott, M. L.},
  title           = {Beyond striping: the bridge multiprocessor
                     file system},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {5},
  pages           = {32--39},
  month           = sep,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:26 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Reddy:1989:SPD,
  author =       "A. L. N. Reddy and P. Banerjee",
  title =        "A study of parallel disk organizations",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "5",
  pages =        "40--47",
  month =        sep,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:26 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Smith:1989:MRT,
  author          = {Smith, J. M. and {Maguire, Jr.}, G. Q.},
  title           = {Measured response times for page-sized fetches
                     on a network},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {5},
  pages           = {48--54},
  month           = sep,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:26 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Wolman:1989:ISI,
  author          = {Wolman, B. and Olson, T. M.},
  title           = {{IOBENCH}: a system independent {IO} benchmark},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {5},
  pages           = {55--70},
  month           = sep,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:26 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

%%% NOTE: the citation key below keeps the historical misspelling
%%% ``Oslon'' so that existing citations continue to resolve; only the
%%% author field carries the corrected surname (the same author appears
%%% as T. M. Olson in entry Wolman:1989:ISI).
@Article{Oslon:1989:DAP,
  author =       "T. M. Olson",
  title =        "Disk array performance in a random {IO} environment",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "5",
  pages =        "71--77",
  month =        sep,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:26 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wolman:1989:ASB,
  author          = {Wolman, B. L.},
  title           = {An analysis of server-based locking},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {5},
  pages           = {78--82},
  month           = sep,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:26 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Debaere:1989:IPC,
  author          = {Debaere, E. H.},
  title           = {Instruction-path coprocessing to solve
                     some {RISC} problems},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {5},
  pages           = {83--94},
  month           = sep,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:26 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Seebauer:1989:MCEb,
  author          = {Seebauer, H.},
  title           = {A memory controller executing segment operations
                     in time {$ O(1) $}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {5},
  pages           = {95--102},
  month           = sep,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:26 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Chiu:1989:RLF,
  author          = {Chiu, P. K.},
  title           = {Representation of logic functions
                     by {\tt if--then} clauses},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {5},
  pages           = {103--107},
  month           = sep,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:26 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Baleanu:1989:ECC,
  author          = {Baleanu, C. and Tomescu, D.},
  title           = {Embedding computers in a cellular array},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {5},
  pages           = {108--115},
  month           = sep,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:26 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Lass:1989:HES,
  author          = {Lass, S.},
  title           = {On hardware enhanced 80386 software emulation,
                     compiled emulation, a program distribution
                     language, and pack computers},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {5},
  pages           = {116--118},
  month           = sep,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:26 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Litaize:1989:MSM,
  author =       "Daniel Litaize and Omar Hammami and Mustapha Lalam and
                 Abdelaziz Mzoughi and Pascal Sainrat",
  title =        "Multiprocessors with a serial multiport memory and a
                 pseudo crossbar of serial links used as a
                 processor-memory switch",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "6",
  pages =        "8--21",
  month =        dec,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:27 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Fritsch:1989:DSM,
  author =       "G. Fritsch and W. Henning and H. Hessenauer and R. Klar
                 and C. U. Linster and C. W. Oehlrich and P. Schlenk and
                 J. Volkert",
  title =        "Distributed shared memory multiprocessor architecture
                 {MEMSY} for high performance parallel computations",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "6",
  pages =        "22--35",
  month =        dec,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:27 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Mendelson:1989:SCC,
  author          = {Mendelson, A. and Pradhan, D. K. and Singh, A. D.},
  title           = {A single cached copy data coherence scheme
                     for multiprocessor systems},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {6},
  pages           = {36--49},
  month           = dec,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:27 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Feitelson:1989:AMU,
  author          = {Feitelson, Dror G. and Rudolph, Larry},
  title           = {Architecture for a multi-user general-purpose
                     parallel system},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {6},
  pages           = {50--56},
  month           = dec,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:27 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Quammen:1989:RWA,
  author          = {Quammen, D. and Miller, D. R. and Tabak, D.},
  title           = {Register window architecture
                     for multitasking applications},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {6},
  pages           = {57--66},
  month           = dec,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:27 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Rosenberg:1989:EEI,
  author          = {Rosenberg, Arnold},
  title           = {Efficient emulations of interconnection networks},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {6},
  pages           = {67--79},
  month           = dec,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:27 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Scherson:1989:DPC,
  author          = {Scherson, Isaac D. and Corbett, Peter F.},
  title           = {Description and performance of a class
                     of orthogonal multiprocessor networks},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {6},
  pages           = {80--90},
  month           = dec,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:27 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{David:1989:EIB,
  author =       "Ilana David and Ran Ginosar and Michael Yoeli",
  title =        "An efficient implementation of {Boolean} functions and
                 finite state machine as self-timed circuit",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "6",
  pages =        "91--104",
  month =        dec,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:27 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% NOTE: the citation key below keeps the historical misspelling
%%% ``Dollan'' so that existing citations continue to resolve; only the
%%% author field carries the corrected surname.
@Article{Dollan:1989:CSP,
  author =       "Apostolos Dollas and Robert F. Krick",
  title =        "The case for the sustained performance computer
                 architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "6",
  pages =        "129--136",
  month =        dec,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:27 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Johnson:1989:WSP,
  author          = {Johnson, Eric E.},
  title           = {Working set prefetching for cache memories},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {6},
  pages           = {137--141},
  month           = dec,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:27 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

%%% NOTE(review): the author string ``K. e H. Lee'' below looks garbled
%%% (BibTeX parses the lowercase ``e'' as a von-part); it is left
%%% unchanged pending verification against the printed issue.
@Article{Lee:1989:MPC,
  author =       "K. e H. Lee and C. H. Lam",
  title =        "Message-passing controller for a shared-memory
                 multiprocessor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "17",
  number =       "6",
  pages =        "142--149",
  month =        dec,
  year =         "1989",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:27 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@article{Hsu:1989:LCF,
  author          = {Tsong-Chih Hsu and Ling-Yang Kung},
  title           = {Logic and conflict-free vector addresses},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {6},
  pages           = {150--153},
  month           = dec,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:27 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Hsu:1989:AGU,
  author          = {Tsong-Chih Hsu and Ling-Yang Kung},
  title           = {An address generation unit for array accessing},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {6},
  pages           = {154--160},
  month           = dec,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:27 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Hsu:1989:HMP,
  author          = {Tsong-Chih Hsu and Ling-Yang Kung},
  title           = {A hardware mechanism for priority queue},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {17},
  number          = {6},
  pages           = {162--169},
  month           = dec,
  year            = {1989},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:27 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

%%% NOTE(review): the title's target count is typeset as the power
%%% $ 2^m $ on the assumption that plain "2m" was a flattened
%%% superscript; confirm against the printed issue.
@Article{Dvorak:1990:MAS,
  author =       "V. Dvorak",
  title =        "Microsequencer architecture supporting arbitrary
                 branching up to $ 2^m $ targets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "1",
  pages =        "9--9",
  month =        mar,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:32 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@article{Dongarra:1990:PVC,
  author          = {Jack J. Dongarra},
  title           = {Performance of various computers using standard linear
                     equations software},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {1},
  pages           = {17--17},
  month           = mar,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:32 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

%%% Given names are written with a plain hyphen (Tsong-Chih, Ling-Yang);
%%% a TeX em-dash ("---") inside a name would typeset as a long dash.
@Article{Hsu:1990:CFO,
  author =       "Tsong-Chih Hsu and Ling-Yang Kung",
  title =        "A comment on {``A Fetch-and-Op Implementation for
                 Parallel Computers''}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "1",
  pages =        "32--32",
  month =        mar,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:32 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@article{Cousins:1990:NAC,
  author          = {Robert Cousins},
  title           = {A novel approach to character interfaces},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {1},
  pages           = {35--35},
  month           = mar,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:32 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Cousins:1990:RPI,
  author          = {Robert Cousins},
  title           = {A reentrant peripheral interface},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {1},
  pages           = {43--43},
  month           = mar,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:32 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Anderson:1990:ACS,
  author          = {Noel W. Anderson},
  title           = {Amorphous computer system architecture: a preliminary
                     look},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {1},
  pages           = {51--51},
  month           = mar,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:32 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Oyang:1990:CEA,
  author          = {Yen-Jen Oyang and Bor-Ting Chang and Shu-May Lin},
  title           = {A cost-effective approach to implement a long
                     instruction word microprocessor},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {1},
  pages           = {59--59},
  month           = mar,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:32 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Fritsch:1990:PBA,
  author          = {C. Fritsch and T. S{\'a}nchez and J. Anaya},
  title           = {Primitive based architectures},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {1},
  pages           = {73--73},
  month           = mar,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:32 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Lorin:1990:MRC,
  author          = {Harold Lorin},
  title           = {A model for recentralization of computing:
                     (distributed processing comes home)},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {1},
  pages           = {81--81},
  month           = mar,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:32 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Teodosiu:1990:CTD,
  author          = {Dan Teodosiu},
  title           = {Computing in three dimensions},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {1},
  pages           = {99--99},
  month           = mar,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:32 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Frazier:1990:ASM,
  author          = {Gary Frazier},
  title           = {{Ariel}: a scalable multiprocessor for the simulation
                     of neural networks},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {1},
  pages           = {107--107},
  month           = mar,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:32 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Colwell:1990:BRH,
  author          = {Robert P. Colwell},
  title           = {Book review: {{\em High-Level Language Computer
                     Architecture\/}} edited by {Veljko Milutinovic
                     (Computer Science Press, 1989)}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {1},
  pages           = {120--122},
  month           = mar,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:32 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Parhami:1990:BRA,
  author          = {Behrooz Parhami},
  title           = {Book review: {{\em Advanced Research in VLSI}}, edited
                     by {Charles L. Seitz (The MIT Press, Cambridge, MA,
                     1989, 373 pp.)}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {1},
  pages           = {122--123},
  month           = mar,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:32 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Matthes:1990:HRG,
  author          = {Wolfgang Matthes},
  title           = {Hardware {Resources}: a generalizing view on computer
                     architectures},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {2},
  pages           = {7--14},
  month           = jun,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:46 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Rauchwerger:1990:MFP,
  author          = {Lawrence Rauchwerger and Michael P. Farmwald},
  title           = {A multiple floating point coprocessor architecture},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {2},
  pages           = {15--24},
  month           = jun,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:46 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Glew:1990:SCT,
  author          = {Andy Glew and Wen-Mei Hwu},
  title           = {Snoopy cache test-and-test-and-set without excessive
                     bus contention},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {2},
  pages           = {25--32},
  month           = jun,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:46 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Higbee:1990:QEC,
  author          = {Lee Higbee},
  title           = {Quick and easy cache performance analysis},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {2},
  pages           = {33--44},
  month           = jun,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:46 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Park:1990:ISF,
  author          = {Arvin Park and Jeffrey C. Becker and Richard J.
                     Lipton},
  title           = {{IOStone}: a synthetic file system benchmark},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {2},
  pages           = {45--52},
  month           = jun,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:46 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Pnevmatikatos:1990:CPI,
  author          = {Dionisios N. Pnevmatikatos and Mark D. Hill},
  title           = {Cache performance of the integer {SPEC} benchmarks on
                     a {RISC}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {2},
  pages           = {53--68},
  month           = jun,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:46 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Ruighaver:1990:MND,
  author          = {A. B. Ruighaver},
  title           = {A modular network for dense optical interconnection of
                     processing elements},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {2},
  pages           = {69--75},
  month           = jun,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:46 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{DeGloria:1990:VVI,
  author          = {Alessandro {De Gloria}},
  title           = {{VISA}: a variable instruction set architecture},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {2},
  pages           = {76--84},
  month           = jun,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:46 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Williams:1990:ADR,
  author          = {Fleur L. Williams and Gordon B. Steven},
  title           = {Address and data register separation on the {M68000}
                     family},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {2},
  pages           = {85--89},
  month           = jun,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:46 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Adve:1990:WON,
  author          = {Sarita V. Adve and Mark D. Hill},
  title           = {Weak ordering---a new definition},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {3a},
  pages           = {2--14},
  month           = jun,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:03 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Gharachorloo:1990:MCE,
  author          = {Kourosh Gharachorloo and Daniel Lenoski and James
                     Laudon and Phillip Gibbons and Anoop Gupta and John
                     Hennessy},
  title           = {Memory consistency and event ordering in scalable
                     shared-memory multiprocessors},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {3a},
  pages           = {15--26},
  month           = jun,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:03 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Lee:1990:SMC,
  author          = {Joonwon Lee and Umakishore Ramachandran},
  title           = {Synchronization with multiprocessor caches},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {3a},
  pages           = {27--37},
  month           = jun,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:03 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Chuang:1990:DPA,
  author          = {Po-Jen Chuang and Nian-Feng Tzeng},
  title           = {Dynamic processor allocation in hypercube computers},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {3a},
  pages           = {40--49},
  month           = jun,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:03 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

%%% NOTE(review): the network size is typeset as $ r^2 \times r^2 $
%%% (squared, not subscripted), on the reasoning that a 3-stage network
%%% of r-by-r crossbars has r^2 ports; confirm against the printed title.
@Article{Youssef:1990:NAF,
  author =       "Abdou Youssef and Bruce Arden",
  title =        "A new approach to fast control of $ r^2 \times r^2 $
                 $3$-stage {Benes} networks of $ r \times r $ crossbar
                 switches",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3a",
  pages =        "50--59",
  month =        jun,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@article{Dally:1990:VCF,
  author          = {William J. Dally},
  title           = {Virtual-channel flow control},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {3a},
  pages           = {60--68},
  month           = jun,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:03 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Borkar:1990:SSM,
  author          = {Shekhar Borkar and Robert Cohn and George Cox and
                     Thomas Gross and H. T. Kung and Monica Lam and Margie
                     Levine and Brian Moore and Wire Moore and Craig
                     Peterson and Jim Susman and Jim Sutton and John
                     Urbanski and Jon Webb},
  title           = {Supporting systolic and memory communication in
                     {iWarp}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {3a},
  pages           = {70--81},
  month           = jun,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:03 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Papadopoulos:1990:MET,
  author          = {Gregory M. Papadopoulos and David E. Culler},
  title           = {{Monsoon}: an explicit token-store architecture},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {3a},
  pages           = {82--91},
  month           = jun,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:03 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Annaratone:1990:KPP,
  author          = {Marco Annaratone and Marco Fillo and Kiyoshi
                     Nakabayashi and Marc Viredaz},
  title           = {The {K2} parallel processor: architecture and hardware
                     implementation},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {3a},
  pages           = {92--101},
  month           = jun,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:03 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Agarwal:1990:APA,
  author          = {Anant Agarwal and Beng-Hong Lim and David Kranz and
                     John Kubiatowicz},
  title           = {{APRIL}: a processor architecture for
                     multiprocessing},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {3a},
  pages           = {104--114},
  month           = jun,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:03 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Bisiani:1990:PDS,
  author          = {Roberto Bisiani and Mosur Ravishankar},
  title           = {{PLUS}: a distributed shared-memory system},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {3a},
  pages           = {115--124},
  month           = jun,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:03 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Bennett:1990:ASC,
  author          = {John K. Bennett and John B. Carter and Willy
                     Zwaenepoel},
  title           = {Adaptive software cache management for distributed
                     shared memory architectures},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {3a},
  pages           = {125--134},
  month           = jun,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:03 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Ditzel:1990:BSV,
  author          = {David R. Ditzel and John L. Hennessy and Bernie Rudin
                     and Alan Jay Smith and Stephen L. Squires and Zeke
                     Zalcstein},
  title           = {Big science versus little science---do you have to
                     build it? (panel session)},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {3a},
  pages           = {136--136},
  month           = jun,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:03 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{OKrafka:1990:EET,
  author          = {Brian W. O'Krafka and A. Richard Newton},
  title           = {An empirical evaluation of two memory-efficient
                     directory methods},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {3a},
  pages           = {138--147},
  month           = jun,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:03 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Lenoski:1990:DBC,
  author          = {Daniel Lenoski and James Laudon and Kourosh
                     Gharachorloo and Anoop Gupta and John Hennessy},
  title           = {The directory-based cache coherence protocol for the
                     {DASH} multiprocessor},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {3a},
  pages           = {148--159},
  month           = jun,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:03 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Przybylski:1990:PIB,
  author          = {Steven Przybylski},
  title           = {The performance impact of block sizes and fetch
                     strategies},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {3a},
  pages           = {160--169},
  month           = jun,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:03 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Alpert:1990:PCL,
  author          = {D. Alpert and A. Averbuch and O. Danieli},
  title           = {Performance comparison of load\slash store and
                     symmetric instruction set architectures},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {3a},
  pages           = {172--181},
  month           = jun,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:03 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Davidson:1990:RCB,
  author          = {Jack W. Davidson and David B. Whalley},
  title           = {Reducing the cost of branches by using registers},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {3a},
  pages           = {182--191},
  month           = jun,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:03 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Love:1990:ISV,
  author          = {Carl E. Love and Harry F. Jordan},
  title           = {An investigation of static versus dynamic scheduling},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {3a},
  pages           = {192--201},
  month           = jun,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:03 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Bhandarkar:1990:VVA,
  author          = {Dileep Bhandarkar and Richard Brunner},
  title           = {{VAX} vector architecture},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {18},
  number          = {3a},
  pages           = {204--215},
  month           = jun,
  year            = {1990},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:03 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Horst:1990:MII,
  author =       "Robert W. Horst and Richard L. Harris and Robert L.
                 Jardine",
  title =        "Multiple instruction issue in the {NonStop Cyclone}
                 processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3a",
  pages =        "216--226",
  month =        jun,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thakkar:1990:POA,
  author =       "Shreekant S. Thakkar and Mark Sweiger",
  title =        "Performance of an {OLTP} application on {Symmetry}
                 multiprocessor system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3a",
  pages =        "228--238",
  month =        jun,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chen:1990:ISG,
  author =       "Ding-Kai Chen and Hong-Men Su and Pen-Chung Yew",
  title =        "The impact of synchronization and granularity on
                 parallel systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3a",
  pages =        "239--248",
  month =        jun,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bugge:1990:TDS,
  author =       "H{\aa}kon O. Bugge and Ernst H. Kristiansen and
                 Bj{\o}rn O. Bakka",
  title =        "Trace-driven simulations for a two-level cache design
                 in open bus systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3a",
  pages =        "250--259",
  month =        jun,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hsu:1990:PMT,
  author =       "Jiun-Ming Hsu and Prithviraj Banerjee",
  title =        "Performance measurement and trace driven simulation of
                 parallel {CAD} and numeric applications on a hypercube
                 multicomputer",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3a",
  pages =        "260--269",
  month =        jun,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Borg:1990:GAV,
  author =       "Anita Borg and R. E. Kessler and David W. Wall",
  title =        "Generation and analysis of very long address traces",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3a",
  pages =        "270--279",
  month =        jun,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Holmer:1990:FPE,
  author =       "Bruce K. Holmer and Barton Sano and Michael Carlton
                 and Peter {Van Roy} and Ralph Haygood and William R. Bush
                 and Alvin M. Despain and Joan M. Pendleton and Tep
                 Dobry",
  title =        "Fast {Prolog} with an extended general purpose
                 architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3a",
  pages =        "282--291",
  month =        jun,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Alkalaj:1990:ASM,
  author =       "Leon Alkalaj and Tom{\'a}s Lang and Milo{\v{s}}
                 Ercegovac",
  title =        "Architectural support for the management of
                 tightly-coupled fine-grain goals in flat concurrent
                 {Prolog}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3a",
  pages =        "292--301",
  month =        jun,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ho:1990:BAD,
  author =       "Samuel Ho and Lawrence Snyder",
  title =        "Balance in architectural design",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3a",
  pages =        "302--310",
  month =        jun,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Reddy:1990:SBP,
  author =       "A. L. Narasimha Reddy and Prithviraj Banerjee",
  title =        "A study of {I/O} behavior of {Perfect} benchmarks on a
                 multiprocessor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3a",
  pages =        "312--321",
  month =        jun,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chen:1990:MPS,
  author =       "Peter M. Chen and David A. Patterson",
  title =        "Maximizing performance in a striped disk array",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3a",
  pages =        "322--331",
  month =        jun,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Shin:1990:DAH,
  author =       "Kang G. Shin and Greg Dykema",
  title =        "A distributed {I/O} architecture for {HARTS}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3a",
  pages =        "332--342",
  month =        jun,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Smith:1990:BBS,
  author =       "Michael D. Smith and Monica S. Lam and Mark A.
                 Horowitz",
  title =        "Boosting beyond static scheduling in a superscalar
                 processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3a",
  pages =        "344--354",
  month =        jun,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Taylor:1990:TSL,
  author =       "George Taylor and Peter Davies and Michael Farmwald",
  title =        "The {TLB} slice---a low-cost high-speed address
                 translation mechanism",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3a",
  pages =        "355--363",
  month =        jun,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Jouppi:1990:IDM,
  author =       "Norman P. Jouppi",
  title =        "Improving direct-mapped cache performance by the
                 addition of a small fully-associative cache and
                 prefetch buffers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3a",
  pages =        "364--373",
  month =        jun,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Davidson:1990:BTO,
  author =       "Edward S. Davidson and Gurindar S. Sohi and Joseph A.
                 Fisher and Greg Grohoski and Yale Patt and J. E. Smith
                 and David R. Stiles",
  title =        "Better than one operation per clock (panel): vectors,
                 {VLIW}, and superscalar",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3a",
  pages =        "376--376",
  month =        jun,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Alverson:1990:TCS,
  author =       "Robert Alverson and David Callahan and Daniel Cummings
                 and Brian Koblenz and Allan Porterfield and Burton
                 Smith",
  title =        "The {Tera} computer system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3b",
  pages =        "1--6",
  month =        sep,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hwang:1990:ORB,
  author =       "K. Hwang and M. Dubois and D. K. Panda and S. Rao and
                 S. Shang and A. Uresin and W. Mao and H. Nair and M.
                 Lytwyn and F. Hsieh and J. Liu and S. Mehrotra and C.
                 M. Cheng",
  title =        "{OMP}: a {RISC}-based multiprocessor using
                 orthogonal-access memories and multiple spanning
                 buses",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3b",
  pages =        "7--22",
  month =        sep,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Dai:1990:BAS,
  author =       "Kechang Dai and Wolfgang K. Giloi",
  title =        "A basic architecture supporting {LGDG} computation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3b",
  pages =        "23--33",
  month =        sep,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Min:1990:ECS,
  author =       "Sang Lyul Min and Jean-Loup Baer and Hyoung-Joo Kim",
  title =        "An efficient caching support for critical sections in
                 large-scale shared-memory multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3b",
  pages =        "34--47",
  month =        sep,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Nagashima:1990:IFA,
  author =       "Umpei Nagashima and Fumio Nishimoto and Takashi
                 Shibata and Hiroshi Itoh and Minoru Gotoh",
  title =        "An improvement of {I/O} function for auxiliary
                 storage: parallel {I/O} for a large scale
                 supercomputing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3b",
  pages =        "48--59",
  month =        sep,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tzeng:1990:AVH,
  author =       "Nian-Feng Tzeng",
  title =        "Analysis of a variant hypercube topology",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3b",
  pages =        "60--70",
  month =        sep,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{vanderHouwen:1990:POS,
  author =       "P. J. van der Houwen and B. P. Sommeijer",
  title =        "Parallel {ODE} solvers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3b",
  pages =        "71--81",
  month =        sep,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Dayde:1990:UPL,
  author =       "M. J. Dayd{\'e} and I. S. Duff",
  title =        "Use of parallel level 3 {BLAS} in {LU} factorization
                 on three vector multiprocessors: the {ALLIANT FX/80},
                 the {CRAY-2}, and the {IBM 3090 VF}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3b",
  pages =        "82--95",
  month =        sep,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Houstis:1990:ENS,
  author =       "E. N. Houstis and J. R. Rice and N. P. Chrisochoides
                 and H. C. Karathanasis and P. N. Papachiou and M. K.
                 Samartzis and E. A. Vavalis and Ko Yang Wang and S.
                 Weerawarana",
  title =        "{//ELLPACK}: a numerical simulation programming
                 environment for parallel {MIMD} machines",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3b",
  pages =        "96--107",
  month =        sep,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Christara:1990:SCP,
  author =       "Christina C. Christara",
  title =        "{Schur} complement preconditioned conjugate gradient
                 methods for spline collocation equations",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3b",
  pages =        "108--120",
  month =        sep,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chung:1990:COP,
  author =       "Kuo-Liang Chung and Ferng-Ching Lin and Wen-Chin
                 Chen",
  title =        "Cost-optimal parallel {B}-spline interpolations",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3b",
  pages =        "121--131",
  month =        sep,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gallivan:1990:SGS,
  author =       "K. Gallivan and A. Sameh and Z. Zlatev",
  title =        "Solving general sparse linear systems using conjugate
                 gradient-type methods",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3b",
  pages =        "132--139",
  month =        sep,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Yuba:1990:DCD,
  author =       "Toshitsugu Yuba and Toshio Shimada and Yoshinori
                 Yamaguchi and Kei Hiraki and Shuichi Sakai",
  title =        "Dataflow computer development in {Japan}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3b",
  pages =        "140--147",
  month =        sep,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sarkar:1990:PPO,
  author =       "Vivek Sarkar and David Cann",
  title =        "{POSC}---a partitioning and optimizing {SISAL}
                 compiler",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3b",
  pages =        "148--164",
  month =        sep,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bodin:1990:LOH,
  author =       "Fran{\c{c}}ois Bodin and Fran{\c{c}}ois Charot",
  title =        "Loop optimization for horizontal microcoded machines",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3b",
  pages =        "164--176",
  month =        sep,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tang:1990:CTD,
  author =       "Peiyi Tang and Pen-Chung Yew and Chuan-Qi Zhu",
  title =        "Compiler techniques for data synchronization in nested
                 parallel loops",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3b",
  pages =        "177--186",
  month =        sep,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hudak:1990:CTD,
  author =       "David E. Hudak and Santosh G. Abraham",
  title =        "Compiler techniques for data partitioning of
                 sequentially iterated parallel loops",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3b",
  pages =        "187--200",
  month =        sep,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Klappholz:1990:PAA,
  author =       "David Klappholz and Kleanthis Psarris and Xiangyun
                 Kong",
  title =        "On the perfect accuracy of an approximate subscript
                 analysis test",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3b",
  pages =        "201--212",
  month =        sep,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Malony:1990:HBP,
  author =       "Allen D. Malony and Daniel A. Reed",
  title =        "A hardware-based performance monitor for the {Intel
                 iPSC/2} hypercube",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3b",
  pages =        "213--226",
  month =        sep,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Dimpsey:1990:PDD,
  author =       "R. T. Dimpsey and R. K. Iyer",
  title =        "Performance degradation due to multiprogramming and
                 system overheads in real workloads: case study on a
                 shared memory multiprocessor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3b",
  pages =        "227--238",
  month =        sep,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Saad:1990:SBP,
  author =       "Youcef Saad and Harry A. G. Wijshoff",
  title =        "{SPARK}: a benchmark package for sparse computations",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3b",
  pages =        "239--253",
  month =        sep,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Cybenko:1990:SPE,
  author =       "George Cybenko and Lyle Kipp and Lynn Pointer and
                 David Kuck",
  title =        "Supercomputer performance evaluation and the {Perfect
                 Benchmarks}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3b",
  pages =        "254--266",
  month =        sep,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Noor:1990:SLS,
  author =       "Ahmed K. Noor and Jeanne M. Peters",
  title =        "Strategies for large-scale structural problems on
                 high-performance computers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3b",
  pages =        "267--280",
  month =        sep,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Zecca:1990:ECV,
  author =       "V. Zecca and A. Kamel",
  title =        "Elastodynamics on clustered vector multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3b",
  pages =        "281--290",
  month =        sep,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Eijkhout:1990:IPP,
  author =       "Victor Eijkhout",
  title =        "Implementation of $5$-point\slash $9$-point
                 multi-level methods on hypercube architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3b",
  pages =        "291--295",
  month =        sep,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chen:1990:SBV,
  author =       "Philip C. Chen",
  title =        "Supercomputer-based visualization systems used for
                 analyzing output data of a numerical weather prediction
                 model",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3b",
  pages =        "296--309",
  month =        sep,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Takahashi:1990:PAW,
  author =       "Yoshizo Takahashi and Shigetaka Sasaki",
  title =        "Parallel automated wire-routing with a number of
                 competing processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3b",
  pages =        "310--317",
  month =        sep,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chan:1990:HAA,
  author =       {Tony F. Chan},
  title =        {Hierarchical algorithms and architectures for parallel
                 scientific computing},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {18},
  number =       {3b},
  pages =        {318--329},
  month =        sep,
  year =         {1990},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:03 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Smith:1990:IDA,
  author =       {Kevin Smith and Bill Appelbe and Kurt Stirewalt},
  title =        {Incremental dependence analysis for interactive
                 parallelization},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {18},
  number =       {3b},
  pages =        {330--341},
  month =        sep,
  year =         {1990},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:03 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Ruhl:1990:PFC,
  author =       {Roland R{\"u}hl and Marco Annaratone},
  title =        {Parallelization of {FORTRAN} code on
                 distributed-memory parallel processors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {18},
  number =       {3b},
  pages =        {342--353},
  month =        sep,
  year =         {1990},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:03 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Gornish:1990:CDD,
  author =       {Edward H. Gornish and Elana D. Granston and Alexander
                 V. Veidenbaum},
  title =        {Compiler-directed data prefetching in multiprocessors
                 with memory hierarchies},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {18},
  number =       {3b},
  pages =        {354--368},
  month =        sep,
  year =         {1990},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:03 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Gao:1990:TEF,
  author =       {Guang R. Gao and Herbert H. J. Hum and Yue-Bong Wong},
  title =        {Towards efficient fine-grain software pipelining},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {18},
  number =       {3b},
  pages =        {369--379},
  month =        sep,
  year =         {1990},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:03 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Andre:1990:PSM,
  author =       {Fran{\c{c}}oise Andr{\'e} and Jean-Louis Pazat and
                 Henry Thomas},
  title =        {{Pandore}: a system to manage data distribution},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {18},
  number =       {3b},
  pages =        {380--388},
  month =        sep,
  year =         {1990},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:03 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Fatoohi:1990:VPA,
  author =       {Rod A. Fatoohi},
  title =        {Vector performance analysis of the {NEC SX-2}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {18},
  number =       {3b},
  pages =        {389--400},
  month =        sep,
  year =         {1990},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:03 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Bodin:1990:PEP,
  author =       {Fran{\c{c}}ois Bodin and Daniel Windheiser and William
                 Jalby and Daya Atapattu and Mannho Lee and Dennis
                 Gannon},
  title =        {Performance evaluation and prediction for parallel
                 algorithms on the {BBN GP1000}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {18},
  number =       {3b},
  pages =        {401--413},
  month =        sep,
  year =         {1990},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:03 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Brochard:1990:DAH,
  author =       {Luigi Brochard and Alex Freau},
  title =        {Designing algorithms on hierarchical memory
                 multiprocessors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {18},
  number =       {3b},
  pages =        {414--427},
  month =        sep,
  year =         {1990},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:03 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Bucher:1990:ACM,
  author =       "Ingrid Y. Bucher and Donald A. Calahan",
  title =        "Access conflicts in multiprocessor memories: queueing
                 models and simulation studies",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "3b",
  pages =        "428--438",
  month =        sep,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:03 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Luque:1990:ITD,
  author =       {Emilio Luque and Ana Ripoll and Porfidio Hern{\'a}ndez
                 and Tom{\'a}s Margalef},
  title =        {Impact of task duplication on static-scheduling
                 performance in multiprocessor systems with variable
                 execution-time tasks},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {18},
  number =       {3b},
  pages =        {439--446},
  month =        sep,
  year =         {1990},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:03 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Gerasoulis:1990:CTG,
  author =       {Apostolos Gerasoulis and Sesh Venugopal and Tao Yang},
  title =        {Clustering task graphs for message passing
                 architectures},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {18},
  number =       {3b},
  pages =        {447--456},
  month =        sep,
  year =         {1990},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:03 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Paalvast:1990:MPP,
  author =       {Edwin M. Paalvast and Arjan J. van Gemund and Henk J.
                 Sips},
  title =        {A method for parallel program generation with an
                 application to the {Booster} language},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {18},
  number =       {3b},
  pages =        {457--469},
  month =        sep,
  year =         {1990},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:03 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Tsoukarellas:1990:RTS,
  author =       {M. A. Tsoukarellas and T. S. Papatheodorou},
  title =        {A run time support system for multiprocessor
                 machines},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {18},
  number =       {3b},
  pages =        {470--478},
  month =        sep,
  year =         {1990},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:03 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Hey:1990:STP,
  author =       {Anthony J. G. Hey},
  title =        {Supercomputing with transputers---past, present and
                 future},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {18},
  number =       {3b},
  pages =        {479--489},
  month =        sep,
  year =         {1990},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:03 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Smith:1990:EA,
  author =       {Burton Smith},
  title =        {The end of architecture},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {18},
  number =       {4},
  pages =        {10--17},
  month =        dec,
  year =         {1990},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:05 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Hill:1990:WS,
  author =       {Mark D. Hill},
  title =        {What is scalability?},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {18},
  number =       {4},
  pages =        {18--21},
  month =        dec,
  year =         {1990},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:05 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Laplante:1990:NSI,
  author =       {P. A. Laplante},
  title =        {A novel single instruction computer architecture},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {18},
  number =       {4},
  pages =        {22--26},
  month =        dec,
  year =         {1990},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:05 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Ginosar:1990:PAP,
  author =       {Ran Ginosar and Nick Michell},
  title =        {On the potential of asynchronous pipelined
                 processors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {18},
  number =       {4},
  pages =        {27--34},
  month =        dec,
  year =         {1990},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:05 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Oyang:1990:EEA,
  author =       {Yen-Jen Oyang and Chun-Hung Wen and Yu-Fen Chen and
                 Shu-May Lin},
  title =        {The effect of employing advanced branching mechanisms
                 in superscalar processors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {18},
  number =       {4},
  pages =        {35--52},
  month =        dec,
  year =         {1990},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:05 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Deville:1990:LCU,
  author =       {Yannick Deville},
  title =        {A low-cost usage-based replacement algorithm for cache
                 memories},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {18},
  number =       {4},
  pages =        {52--58},
  month =        dec,
  year =         {1990},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:05 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Gunther:1990:HSM,
  author =       {Bernard K. Gunther},
  title =        {A high speed mechanism for short branches},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {18},
  number =       {4},
  pages =        {59--61},
  month =        dec,
  year =         {1990},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:05 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{McLaughlin:1990:DFD,
  author =       {Robert McLaughlin},
  title =        {Design for fast {DSP} machine},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {18},
  number =       {4},
  pages =        {62--66},
  month =        dec,
  year =         {1990},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:05 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Joerg:1990:SPN,
  author =       {Werner B. Joerg},
  title =        {A subclass of {Petri Nets} as design abstraction for
                 parallel architectures},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {18},
  number =       {4},
  pages =        {67--77},
  month =        dec,
  year =         {1990},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:05 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:1990:UN,
  author =       {Mark Thorson},
  title =        {{Usenet} Nuggets},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {18},
  number =       {4},
  pages =        {80--89},
  month =        dec,
  year =         {1990},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:05 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Langdon:1990:BRH,
  author =       "Glen G. {Langdon, Jr.}",
  title =        "Book review: {{\em Highly Parallel Computing\/}} by
                 {George Almasi and Allan Gottlieb (Benjamin\slash
                 Cummings, 1989)}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "4",
  pages =        "90--90",
  month =        dec,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Langdon:1990:BRS,
  author =       {Glen G. {Langdon, Jr.}},
  title =        {Book review: {{\em Solving Problems on Concurrent
                 Processors, Vol II: Software for Concurrent
                 Processors\/}} by {I. Angus, G. Fox, J. Kim, and D.
                 Walker (Prentice-Hall, 1990)}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {18},
  number =       {4},
  pages =        {90--91},
  month =        dec,
  year =         {1990},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:05 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Dikotter:1990:BRD,
  author =       "Marc Dikotter",
  title =        "Book review: {{\em The Definition of Standard ML\/}}
                 by {R. Milner, M. Tofte, R. Harper}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "18",
  number =       "4",
  pages =        "91--91",
  month =        dec,
  year =         "1990",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Leighton:1991:SPS,
  author =       {F. T. Leighton},
  title =        {Selected Papers from the {Symposium on Parallel
                 Algorithms and Architectures}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {19},
  number =       {1},
  pages =        {5--5},
  month =        mar,
  year =         {1991},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:33 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Ngai:1991:FAR,
  author =       {John Y. Ngai and Charles L. Seitz},
  title =        {A framework for adaptive routing in multicomputer
                 networks},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {19},
  number =       {1},
  pages =        {6--14},
  month =        mar,
  year =         {1991},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:33 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Beigel:1991:PNI,
  author =       "Richard Beigel and Clyde P. Kruskal",
  title =        "Processor networks and interconnection networks
                 without long wires (extended abstract)",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "1",
  pages =        "15--24",
  month =        mar,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:33 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Annexstein:1991:FTH,
  author =       {Fred Annexstein},
  title =        {Fault tolerance in hypercube-derivative networks
                 (preliminary version)},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {19},
  number =       {1},
  pages =        {25--34},
  month =        mar,
  year =         {1991},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:33 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Fujimoto:1991:VTM,
  author =       {Richard M. Fujimoto},
  title =        {The {Virtual Time Machine}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {19},
  number =       {1},
  pages =        {35--44},
  month =        mar,
  year =         {1991},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:33 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Bilardi:1991:OVA,
  author =       "Gianfranco Bilardi and Scot W. Hornick and Majid
                 Sarrafzadeh",
  title =        "Optimal {VLSI} architectures for multidimensional
                 {DFT} (preliminary version)",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "1",
  pages =        "45--52",
  month =        mar,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:33 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thomborson:1991:SIM,
  author =       {Clark D. Thomborson and Belle W.-Y. Wei},
  title =        {Systolic implementations of a move-to-front text
                 compressor},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {19},
  number =       {1},
  pages =        {53--60},
  month =        mar,
  year =         {1991},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:33 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Knight:1991:TLL,
  author =       {Thomas F. {Knight, Jr.}},
  title =        {Technologies for low latency interconnection
                 switches},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {19},
  number =       {1},
  pages =        {61--68},
  month =        mar,
  year =         {1991},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:33 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Herbordt:1991:MPA,
  author =       {Martin C. Herbordt and Charles C. Weems and James C.
                 Corbett},
  title =        {Message-passing algorithms for a {SIMD} torus with
                 coteries},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {19},
  number =       {1},
  pages =        {69--78},
  month =        mar,
  year =         {1991},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:33 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Konstantinidou:1991:CRP,
  author =       {S. Konstantinidou and L. Snyder},
  title =        {The chaos router: a practical application of
                 randomization in network routing},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {19},
  number =       {1},
  pages =        {79--88},
  month =        mar,
  year =         {1991},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:33 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Bruck:1991:RAE,
  author =       {Jehoshua Bruck and Robert Cypher and Danny Soroker},
  title =        {Running algorithms efficiently on faulty hypercubes
                 (extended abstract)},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {19},
  number =       {1},
  pages =        {89--96},
  month =        mar,
  year =         {1991},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:33 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Nishimura:1991:ASM,
  author =       {Naomi Nishimura},
  title =        {Asynchronous shared memory parallel computation
                 (preliminary version)},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {19},
  number =       {1},
  pages =        {97--105},
  month =        mar,
  year =         {1991},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:33 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Shand:1991:HSL,
  author =       {M. Shand and P. Bertin and J. Vuillemin},
  title =        {Hardware speedups in long integer multiplication},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {19},
  number =       {1},
  pages =        {106--113},
  month =        mar,
  year =         {1991},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:33 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Thapar:1991:CCL,
  author =       {Manu Thapar and Bruce Delagi},
  title =        {Cache coherence for large scale shared memory
                 multiprocessors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {19},
  number =       {1},
  pages =        {114--119},
  month =        mar,
  year =         {1991},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:33 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Grabienski:1991:FFS,
  author =       {Peter Grabienski},
  title =        {{FLIP-FLOP}: a stack-oriented multiprocessing system},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {19},
  number =       {1},
  pages =        {120--127},
  month =        mar,
  year =         {1991},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:33 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Price:1991:TAD,
  author =       {Camille C. Price},
  title =        {Task allocation in data flow multiprocessors: an
                 annotated bibliography},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {19},
  number =       {1},
  pages =        {128--134},
  month =        mar,
  year =         {1991},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:33 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Adams:1991:PPP,
  author =       {Rod Adams and Gordon Steven},
  title =        {A parallel pipelined processor with conditional
                 instruction execution},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {19},
  number =       {1},
  pages =        {135--142},
  month =        mar,
  year =         {1991},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:33 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:1991:UNa,
  author =       "Mark Thorson",
  title =        "{Usenet} Nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "1",
  pages =        "146--150",
  month =        mar,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:33 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hilton:1991:BRS,
  author =       "Michael L. Hilton",
  title =        "Book review: {{\em Systems Programming in Parallel
                 Logic Languages\/}} by {Ian Foster (Prentice Hall,
                 1990)}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "1",
  pages =        "151--151",
  month =        mar,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:33 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Anthony:1991:BRT,
  author =       "Keith Anthony",
  title =        "Book review: {{\em Technology Projection Modeling of
                 Future Computer Systems\/}} by {Al Cutaia
                 (Prentice-Hall, 1990)}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "1",
  pages =        "152--153",
  month =        mar,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:33 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Schneck:1991:BRO,
  author =       "Paul B. Schneck",
  title =        "Book review: {{\em Optimizing FORTRAN Programs\/}} by
                 {C. F. Schofield (Halsted Press, 1989)}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "1",
  pages =        "153--154",
  month =        mar,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:33 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bernecky:1991:BRMa,
  author =       "Robert Bernecky",
  title =        "Book review: {{\em Multiprocessors\/}} by {Daniel
                 Tabak (Prentice Hall, Englewood Cliffs, NJ)}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "1",
  pages =        "154--156",
  month =        mar,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:33 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bernecky:1991:BRMb,
  author =       "Robert Bernecky",
  title =        "Book review: {{\em Multiprocessor Performance\/}} by
                 {Erol Gelenbe (J. Wiley \& Sons, Chichester,
                 England)}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "1",
  pages =        "156--157",
  month =        mar,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:33 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Fulcher:1991:BRN,
  author =       "John Fulcher",
  title =        "Book review: {{\em Neural Net Applications and
                 Products\/}} by {Richard K. Miller, Terri C. Walker,
                 and Anne M. Ryan (SEAI Technical Publications, 1990)}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "1",
  pages =        "157--158",
  month =        mar,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:33 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wolfe:1991:VIS,
  author =       "Andrew Wolfe and John P. Shen",
  title =        "A variable instruction stream extension to the {VLIW}
                 architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "2",
  pages =        "2--14",
  month =        apr,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Katevenis:1991:RBP,
  author =       "Manolis Katevenis and Nestoras Tzartzanis",
  title =        "Reducing the branch penalty by rearranging
                 instructions in a double-width memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "2",
  pages =        "15--27",
  month =        apr,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lee:1991:FPP,
  author =       "Roland L. Lee and Alex Y. Kwok and Fay{\'e} A.
                 Briggs",
  title =        "The floating point performance of a superscalar
                 {SPARC} processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "2",
  pages =        "28--37",
  month =        apr,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Callahan:1991:SP,
  author =       "David Callahan and Ken Kennedy and Allan Porterfield",
  title =        "Software prefetching",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "2",
  pages =        "40--52",
  month =        apr,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sohi:1991:HBD,
  author =       "Gurindar S. Sohi and Manoj Franklin",
  title =        "High-bandwidth data memory systems for superscalar
                 processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "2",
  pages =        "53--62",
  month =        apr,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lam:1991:CPO,
  author =       "Monica D. Lam and Edward E. Rothberg and Michael E.
                 Wolf",
  title =        "The cache performance and optimizations of blocked
                 algorithms",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "2",
  pages =        "63--74",
  month =        apr,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Mogul:1991:ECS,
  author =       "Jeffrey C. Mogul and Anita Borg",
  title =        "The effect of context switches on cache performance",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "2",
  pages =        "75--84",
  month =        apr,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Keppel:1991:PIF,
  author =       "David Keppel",
  title =        "A portable interface for on-the-fly instruction space
                 modification",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "2",
  pages =        "86--95",
  month =        apr,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Appel:1991:VMP,
  author =       "Andrew W. Appel and Kai Li",
  title =        "Virtual memory primitives for user programs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "2",
  pages =        "96--107",
  month =        apr,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Anderson:1991:IAO,
  author =       "Thomas E. Anderson and Henry M. Levy and Brian N.
                 Bershad and Edward D. Lazowska",
  title =        "The interaction of architecture and operating system
                 design",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "2",
  pages =        "108--120",
  month =        apr,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bradlee:1991:IRA,
  author =       "David G. Bradlee and Susan J. Eggers and Robert R.
                 Henry",
  title =        "Integrating register allocation and instruction
                 scheduling for {RISCs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "2",
  pages =        "122--131",
  month =        apr,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Benitez:1991:CGS,
  author =       "Manuel E. Benitez and Jack W. Davidson",
  title =        "Code generation for streaming: an access\slash execute
                 mechanism",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "2",
  pages =        "132--141",
  month =        apr,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bagrodia:1991:EIH,
  author =       "Rajive Bagrodia and Sharad Mathur",
  title =        "Efficient implementation of high-level parallel
                 programs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "2",
  pages =        "142--151",
  month =        apr,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Mangione-Smith:1991:VRD,
  author =       "William Mangione-Smith and Santosh G. Abraham and
                 Edward S. Davidson",
  title =        "Vector register design for polycyclic vector
                 scheduling",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "2",
  pages =        "154--163",
  month =        apr,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Culler:1991:FGP,
  author =       "David E. Culler and Anurag Sah and Klaus E. Schauser
                 and Thorsten von Eicken and John Wawrzynek",
  title =        "Fine-grain parallelism with minimal hardware support:
                 a compiler-controlled threaded abstract machine",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "2",
  pages =        "164--175",
  month =        apr,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wall:1991:LIL,
  author =       "David W. Wall",
  title =        "Limits of instruction-level parallelism",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "2",
  pages =        "176--188",
  month =        apr,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lee:1991:PCP,
  author =       "Edward K. Lee and Randy H. Katz",
  title =        "Performance consequences of parity placement in disk
                 arrays",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "2",
  pages =        "190--199",
  month =        apr,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Cate:1991:CCC,
  author =       "Vincent Cate and Thomas Gross",
  title =        "Combining the concepts of compression and caching for
                 a two-level filesystem",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "2",
  pages =        "200--211",
  month =        apr,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bolosky:1991:NPT,
  author =       "William J. Bolosky and Michael L. Scott and Robert P.
                 Fitzgerald and Robert J. Fowler and Alan L. Cox",
  title =        "{NUMA} policies and their relation to memory
                 architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "2",
  pages =        "212--221",
  month =        apr,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chaiken:1991:LDS,
  author =       "David Chaiken and John Kubiatowicz and Anant Agarwal",
  title =        "{LimitLESS} directories: a scalable cache coherence
                 scheme",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "2",
  pages =        "224--234",
  month =        apr,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Min:1991:ECB,
  author =       "Sang L. Min and Jong-Deok Choi",
  title =        "An efficient cache-based access anomaly detection
                 scheme",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "2",
  pages =        "235--244",
  month =        apr,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gharachorloo:1991:PEM,
  author =       "Kourosh Gharachorloo and Anoop Gupta and John
                 Hennessy",
  title =        "Performance evaluation of memory consistency models
                 for shared-memory multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "2",
  pages =        "245--257",
  month =        apr,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Freudenthal:1991:PCF,
  author =       "Eric Freudenthal and Allan Gottlieb",
  title =        "Process coordination with fetch-and-increment",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "2",
  pages =        "260--268",
  month =        apr,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Mellor-Crummey:1991:SC,
  author =       "John M. Mellor-Crummey and Michael L. Scott",
  title =        "Synchronization without contention",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "2",
  pages =        "269--278",
  month =        apr,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Johnson:1991:CRB,
  author =       "Douglas Johnson",
  title =        "The case for a read barrier",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "2",
  pages =        "279--287",
  month =        apr,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Cmelik:1991:AMS,
  author =       "Robert F. Cmelik and Shing I. Kong and David R. Ditzel
                 and Edmund J. Kelly",
  title =        "An analysis of {MIPS} and {SPARC} instruction set
                 utilization on the {SPEC} benchmarks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "2",
  pages =        "290--302",
  month =        apr,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hall:1991:PCA,
  author =       "C. Brian Hall and Kevin O'Brien",
  title =        "Performance characteristics of architectural features
                 of the {IBM RISC System\slash 6000}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "2",
  pages =        "303--309",
  month =        apr,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bhandarkar:1991:PAC,
  author =       "Dileep Bhandarkar and Douglas W. Clark",
  title =        "Performance from architecture: comparing a {RISC} and
                 a {CISC} with similar hardware organization",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "2",
  pages =        "310--319",
  month =        apr,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{DeMara:1991:SPA,
  author =       "R. F. DeMara and D. I. Moldovan",
  title =        "The {SNAP-1} parallel {AI} prototype",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "3",
  pages =        "2--11",
  month =        may,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:01 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tan:1991:GEN,
  author =       "Wei Siong Tan and H. Russ and Cecil O. Alford",
  title =        "{GT-EP}: a novel high-performance real-time
                 architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "3",
  pages =        "13--21",
  month =        may,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:01 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Higuchi:1991:IPA,
  author =       "Tetsuya Higuchi and Tatsumi Furuya and Kenichi Handa
                 and Naoto Takahashi and Hiroyasu Nishiyama and Akio
                 Kokubu",
  title =        "{IXM2}: a parallel associative processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "3",
  pages =        "22--31",
  month =        may,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:01 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kaeli:1991:BHT,
  author =       "David R. Kaeli and Philip G. Emma",
  title =        "Branch history table prediction of moving target
                 branches due to subroutine returns",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "3",
  pages =        "34--42",
  month =        may,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:01 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Klaiber:1991:ASC,
  author =       "Alexander C. Klaiber and Henry M. Levy",
  title =        "An architecture for software-controlled data
                 prefetching",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "3",
  pages =        "43--53",
  month =        may,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:01 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Fu:1991:DPM,
  author =       "John W. C. Fu and Janak H. Patel",
  title =        "Data prefetching in multiprocessor vector cache
                 memories",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "3",
  pages =        "54--63",
  month =        may,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:01 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Harper:1991:RMC,
  author =       "D. T. {Harper III}",
  title =        "Reducing memory contention in shared memory
                 multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "3",
  pages =        "66--73",
  month =        may,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:01 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Rau:1991:PRI,
  author =       "B. Ramakrishna Rau",
  title =        "Pseudo-randomly interleaved memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "3",
  pages =        "74--83",
  month =        may,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:01 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Li:1991:EMS,
  author =       "Kai Li and Karin Petersen",
  title =        "Evaluation of memory system extensions",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "3",
  pages =        "84--93",
  month =        may,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:01 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Dowd:1991:HPI,
  author =       "Patrick W. Dowd",
  title =        "High performance interprocessor communication through
                 optical wavelength division multiple access channels",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "3",
  pages =        "96--105",
  month =        may,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:01 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@article{Landin:1991:RFI,
  author          = {Anders Landin and Erik Hagersten and Seif Haridi},
  title           = {Race-free interconnection networks and
                     multiprocessor consistency},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {3},
  pages           = {106--115},
  month           = may,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:01 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Lin:1991:DFM,
  author          = {Xiaola Lin and Lionel M. Ni},
  title           = {Deadlock-free multicast wormhole routing in
                     multicomputer networks},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {3},
  pages           = {116--125},
  month           = may,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:01 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Farrens:1991:DBR,
  author =       "Matthew K. Farrens and Arvin Park",
  title =        "Dynamic base register caching: a technique for
                 reducing address bus width",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "3",
  pages =        "128--137",
  month =        may,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:01 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Olukotun:1991:ICH,
  author =       "Oyekunle A. Olukotun and Trevor N. Mudge and Richard
                 B. Brown",
  title =        "Implementing a cache for a high-performance {GaAs}
                 microprocessor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "3",
  pages =        "138--147",
  month =        may,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:01 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@article{Kurian:1991:CPE,
  author          = {Lizyamma Kurian and Paul T. Hulina and
                     Lee D. Coraor and Dhamir N. Mannai},
  title           = {Classification and performance evaluation of
                     instruction buffering techniques},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {3},
  pages           = {150--159},
  month           = may,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:01 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Nakajima:1991:OVS,
  author          = {Masaitsu Nakajima and Hiraku Nakano and
                     Yasuhiro Nakakura and Tadahiro Yoshida and
                     Yoshiyuki Goi and Yuji Nakai and Reiji Segawa and
                     Takeshi Kishida and Hiroshi Kadota},
  title           = {{OHMEGA}: a {VLSI} superscalar processor
                     architecture for numerical applications},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {3},
  pages           = {160--168},
  month           = may,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:01 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Vajapeyam:1991:ESC,
  author          = {Sriram Vajapeyam and Gurindar S. Sohi and
                     Wei-Chung Hsu},
  title           = {An empirical study of the {CRAY Y-MP} processor
                     using the {Perfect Club} benchmarks},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {3},
  pages           = {170--179},
  month           = may,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:01 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Stephens:1991:ILP,
  author =       "Chriss Stephens and Bryce Cogswell and John Heinlein
                 and Gregory Palmer and John P. Shen",
  title =        "Instruction level profiling and evaluation of the
                 {IBM RS\slash 6000}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "3",
  pages =        "180--189",
  month =        may,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:01 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Dimpsey:1991:PPT,
  author =       "Robert T. Dimpsey and Ravishankar K. Iyer",
  title =        "Performance prediction and tuning on a
                 multiprocessor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "3",
  pages =        "190--199",
  month =        may,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:01 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Oehlrich:1991:PEC,
  author =       "Claus-Werner Oehlrich and Andreas Quick",
  title =        "Performance evaluation of a communication system for
                 transputer-networks based on monitored event traces",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "3",
  pages =        "202--211",
  month =        may,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:01 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Konstantinidou:1991:CRA,
  author =       "Smaragda Konstantinidou and Lawrence Snyder",
  title =        "Chaos router: architecture and performance",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "3",
  pages =        "212--221",
  month =        may,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:01 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@article{Shukla:1991:SPC,
  author          = {Shridhar B. Shukla and Dharma P. Agrawal},
  title           = {Scheduling pipelined communication in distributed
                     memory multiprocessors for real-time
                     applications},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {3},
  pages           = {222--231},
  month           = may,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:01 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Adve:1991:DDR,
  author          = {Sarita V. Adve and Mark D. Hill and
                     Barton P. Miller and Robert H. B. Netzer},
  title           = {Detecting data races on weak memory systems},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {3},
  pages           = {234--243},
  month           = may,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:01 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Koldinger:1991:VTD,
  author          = {Eric J. Koldinger and Susan J. Eggers and
                     Henry M. Levy},
  title           = {On the validity of trace-driven simulation for
                     multiprocessors},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {3},
  pages           = {244--253},
  month           = may,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:01 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Gupta:1991:CEL,
  author          = {Anoop Gupta and John Hennessy and
                     Kourosh Gharachorloo and Todd Mowry and
                     Wolf-Dietrich Weber},
  title           = {Comparative evaluation of latency reducing and
                     tolerating techniques},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {3},
  pages           = {254--263},
  month           = may,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:01 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Chang:1991:IAF,
  author          = {Pohua P. Chang and Scott A. Mahlke and
                     William Y. Chen and Nancy J. Warter and
                     Wen-mei W. Hwu},
  title           = {{IMPACT}: an architectural framework for
                     multiple-instruction-issue processors},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {3},
  pages           = {266--275},
  month           = may,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:01 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Butler:1991:SIS,
  author          = {Michael Butler and Tse-Yu Yeh and Yale Patt and
                     Mitch Alsup and Hunter Scales and
                     Michael Shebanow},
  title           = {Single instruction stream parallelism is greater
                     than two},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {3},
  pages           = {276--286},
  month           = may,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:01 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Melvin:1991:EFG,
  author          = {Stephen Melvin and Yale Patt},
  title           = {Exploiting fine-grained parallelism through a
                     combination of hardware and software techniques},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {3},
  pages           = {287--296},
  month           = may,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:01 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Adve:1991:CHS,
  author          = {Sarita V. Adve and Vikram S. Adve and
                     Mark D. Hill and Mary K. Vernon},
  title           = {Comparison of hardware and software cache
                     coherence schemes},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {3},
  pages           = {298--308},
  month           = may,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:01 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Simoni:1991:MPL,
  author          = {Richard Simoni and Mark Horowitz},
  title           = {Modeling the performance of limited pointers
                     directories for cache coherence},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {3},
  pages           = {309--319},
  month           = may,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:01 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Quammen:1991:FRM,
  author          = {Donna J. Quammen and D. Richard Miller},
  title           = {Flexible register management for sequential
                     programs},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {3},
  pages           = {320--329},
  month           = may,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:01 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Bradlee:1991:ERP,
  author          = {David G. Bradlee and Susan J. Eggers and
                     Robert R. Henry},
  title           = {The effect on {RISC} performance of register set
                     size and structure versus code generation
                     strategy},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {3},
  pages           = {330--339},
  month           = may,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:01 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Papadopoulos:1991:MRV,
  author          = {Gregory M. Papadopoulos and Kenneth R. Traub},
  title           = {Multithreading: a revisionist view of dataflow
                     architectures},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {3},
  pages           = {342--351},
  month           = may,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:01 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Chiueh:1991:MTV,
  author          = {Tzi-cker Chiueh},
  title           = {Multi-threaded vectorization},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {3},
  pages           = {352--361},
  month           = may,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:01 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Farrens:1991:SAI,
  author          = {Matthew K. Farrens and Andrew R. Pleszkun},
  title           = {Strategies for achieving improved processor
                     throughput},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {3},
  pages           = {362--369},
  month           = may,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:01 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Kagimasa:1991:ASM,
  author          = {Toyohiko Kagimasa and Kikuo Takahashi and
                     Toshiaki Mori and Seiichi Yoshizumi},
  title           = {Adaptive storage management for very large
                     virtual\slash real storage systems},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {3},
  pages           = {372--379},
  month           = may,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:01 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Hall:1991:VVA,
  author          = {Judith S. Hall and Paul T. Robinson},
  title           = {Virtualizing the {VAX} architecture},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {3},
  pages           = {380--389},
  month           = may,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:01 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Akella:1991:MMI,
  author          = {Janaki Akella and Daniel P. Siewiorek},
  title           = {Modeling and measurement of the impact of
                     {Input\slash Output} on system performance},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {3},
  pages           = {390--399},
  month           = may,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:01 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Wilson:1991:PSP,
  author          = {Paul R. Wilson},
  title           = {Pointer swizzling at page fault time: efficiently
                     supporting huge address spaces on standard
                     hardware},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {4},
  pages           = {6--13},
  month           = jun,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:06 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Kuga:1991:DDH,
  author          = {Morihiro Kuga and Kazuaki Murakami and
                     Shinji Tomita},
  title           = {{DSNS} (dynamically-hazard-resolved
                     statically-code-scheduled, nonuniform
                     superscalar): yet another superscalar processor
                     architecture},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {4},
  pages           = {14--29},
  month           = jun,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:06 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Ponder:1991:PVA,
  author          = {Carl Ponder},
  title           = {Performance variation across benchmark suites},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {4},
  pages           = {30--36},
  month           = jun,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:06 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Conte:1991:BSB,
  author          = {Thomas M. Conte and Wen-mei W. Hwu},
  title           = {A brief survey of benchmark usage in the
                     architecture community},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {4},
  pages           = {37--44},
  month           = jun,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:06 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Morris:1991:CER,
  author          = {Todd D. Morris and Edward F. Gehringer},
  title           = {A cost-effective reliable multipath
                     interconnection network},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {4},
  pages           = {45--65},
  month           = jun,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:06 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Laplante:1991:ICB,
  author =       "Phillip A. Laplante",
  title =        "An improved conditional branching scheme for a single
                 instruction computer architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "4",
  pages =        "66--68",
  month =        jun,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:06 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@article{DuBois:1991:DED,
  author          = {Andrew J. DuBois and John Rasure},
  title           = {Design and evaluation of a distributed
                     asynchronous {VLSI} crossbar switch controller
                     for a packet switched supercomputer network},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {4},
  pages           = {69--79},
  month           = jun,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:06 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Lass:1991:CCP,
  author          = {Stanley E. Lass},
  title           = {The compiler controlled pack cache and messaging},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {4},
  pages           = {80--85},
  month           = jun,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:06 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Ungerer:1991:MLP,
  author          = {Theo Ungerer and Eberhard Zehendner},
  title           = {A multi-level parallelism architecture},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {4},
  pages           = {86--93},
  month           = jun,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:06 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Matthes:1991:HMO,
  author          = {Wolfgang Matthes},
  title           = {How many operation units are adequate?},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {4},
  pages           = {94--108},
  month           = jun,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:06 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Cunha:1991:AMM,
  author          = {Alberto R. Cunha and Carlos N. Ribeiro and
                     Jos{\'e} A. Marques},
  title           = {The architecture of a memory management unit for
                     object-oriented systems},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {4},
  pages           = {109--116},
  month           = jun,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:06 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Matloff:1991:AAS,
  author          = {Norman Matloff},
  title           = {An argument against scalable cache coherency},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {4},
  pages           = {117--123},
  month           = jun,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:06 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Rodohan:1991:OAO,
  author =       "D. P. Rodohan and R. J. Glover",
  title =        "An overview of the {A*} architecture for optimisation
                 problems in a logic programming environment",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "4",
  pages =        "124--131",
  month =        jun,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:06 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@article{Wray:1991:TSD,
  author          = {Stuart C. Wray},
  title           = {Time-sequenced {DMA} for multimedia computers},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {4},
  pages           = {132--137},
  month           = jun,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:06 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Ramamoorthy:1991:BMC,
  author          = {Ganesh Ramamoorthy and Alok N. Choudhary},
  title           = {A bibliography for multiprocessor cache memories},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {4},
  pages           = {138--153},
  month           = jun,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:06 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Smith:1991:SBC,
  author          = {Alan Jay Smith},
  title           = {Second bibliography on {Cache} memories},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {19},
  number          = {4},
  pages           = {154--182},
  month           = jun,
  year            = {1991},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:06 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Thorson:1991:UNb,
  author =       {Mark Thorson},
  title =        {{Usenet} Nuggets},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {19},
  number =       {4},
  pages =        {185--191},
  month =        jun,
  year =         {1991},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:06 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Patterson:1991:TGS,
  author =       {David A. Patterson},
  title =        {Towards guidelines for {SIGARCH} sponsored
                 conferences},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {19},
  number =       {5},
  pages =        {7--7},
  month =        sep,
  year =         {1991},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:26 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Maa:1991:TED,
  author =       {Yeong-Chang Maa and Dhiraj K. Pradhan and Dominique
                 Thi{\'e}baut},
  title =        {Two economical directory schemes for large-scale cache
                 coherent multiprocessors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {19},
  number =       {5},
  pages =        {10--10},
  month =        sep,
  year =         {1991},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:26 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:1991:UNc,
  author =       {Mark Thorson},
  title =        {{Usenet} Nuggets},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {19},
  number =       {5},
  pages =        {21--26},
  month =        sep,
  year =         {1991},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:26 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Ivanovic:1991:BRC,
  author =       "Vladimir G. Ivanovic",
  title =        "Book review: {{\em Computation Structures\/}} by
                 {Stephen A. Ward and Robert H. Halstead, Jr. (MIT Press
                 or McGraw-Hill, 1990)}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "19",
  number =       "5",
  pages =        "27--29",
  month =        sep,
  year =         "1991",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:26 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Krieger:1991:BRM,
  author =       {Moshe Krieger},
  title =        {Book review: {{\em Multiprocessors\/}} by {D. Tabak
                 (Prentice-Hall, 1990)}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {19},
  number =       {5},
  pages =        {27--29},
  month =        sep,
  year =         {1991},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:26 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Fulcher:1991:BRM,
  author =       {John Fulcher},
  title =        {Book review: {{\em The 68000 and 68020
                 Microprocessors: Hardware, Software and Interfacing
                 Techniques\/}} by {W. Triebel and A. Singh (Prentice
                 Hall, 1991)}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {19},
  number =       {5},
  pages =        {29--30},
  month =        sep,
  year =         {1991},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:26 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Baker:1991:PIS,
  author =       {Henry G. Baker},
  title =        {Precise instruction scheduling without a precise
                 machine model},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {19},
  number =       {6},
  pages =        {4--8},
  month =        dec,
  year =         {1991},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:27 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{McLaughlin:1991:LAB,
  author =       {Robert McLaughlin},
  title =        {Look-ahead branching hardware},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {19},
  number =       {6},
  pages =        {9--11},
  month =        dec,
  year =         {1991},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:27 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Beth:1991:RCI,
  author =       {Thomas Beth and Volker Hatz},
  title =        {A restricted crossbar implementation and its
                 applications},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {19},
  number =       {6},
  pages =        {12--16},
  month =        dec,
  year =         {1991},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:27 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:1991:UNd,
  author =       {Mark Thorson},
  title =        {{Usenet} nuggets},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {19},
  number =       {6},
  pages =        {19--23},
  month =        dec,
  year =         {1991},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:27 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Bernecky:1991:BRP,
  author =       {Robert Bernecky},
  title =        {Book review: {{\em Past, Present, Parallel: A Survey
                 of Available Parallel Computing Systems\/}} by {Arthur
                 Trew \& Greg Wilson (Eds.), (Springer-Verlag 1991)}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {19},
  number =       {6},
  pages =        {24--25},
  month =        dec,
  year =         {1991},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:27 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Singh:1992:SSP,
  author =       {Jaswinder Pal Singh and Wolf-Dietrich Weber and Anoop
                 Gupta},
  title =        {{SPLASH}: {Stanford} parallel applications for
                 shared-memory},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {20},
  number =       {1},
  pages =        {5--44},
  month =        mar,
  year =         {1992},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:33 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Wajda:1992:SSP,
  author =       {Eligiusz Wajda},
  title =        {{SPIRE}: streaming processing with instructions
                 release element},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {20},
  number =       {1},
  pages =        {45--54},
  month =        mar,
  year =         {1992},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:33 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Deville:1992:CRP,
  author =       {Yannick Deville and Jean Gobert},
  title =        {A class of replacement policies for medium and
                 high-associativity structures},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {20},
  number =       {1},
  pages =        {55--64},
  month =        mar,
  year =         {1992},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:33 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Zucker:1992:PSM,
  author =       {Richard N. Zucker and Jean-Loup Baer},
  title =        {A performance study of memory consistency models},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {20},
  number =       {2},
  pages =        {2--12},
  month =        may,
  year =         {1992},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:43 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Keleher:1992:LRC,
  author =       {Pete Keleher and Alan L. Cox and Willy Zwaenepoel},
  title =        {Lazy release consistency for software distributed
                 shared memory},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {20},
  number =       {2},
  pages =        {13--21},
  month =        may,
  year =         {1992},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:43 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Gharachorloo:1992:HML,
  author =       {Kourosh Gharachorloo and Anoop Gupta and John
                 Hennessy},
  title =        {Hiding memory latency using dynamic scheduling in
                 shared-memory multiprocessors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {20},
  number =       {2},
  pages =        {22--33},
  month =        may,
  year =         {1992},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:43 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Fernandes:1992:EBB,
  author =       {Edil S. T. Fernandes and Fernando M. B. Barbosa},
  title =        {Effects of building blocks on the performance of
                 super-scalar architecture},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {20},
  number =       {2},
  pages =        {36--45},
  month =        may,
  year =         {1992},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:43 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Lam:1992:LCF,
  author =       {Monica S. Lam and Robert P. Wilson},
  title =        {Limits of control flow on parallelism},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {20},
  number =       {2},
  pages =        {46--57},
  month =        may,
  year =         {1992},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:43 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Franklin:1992:ESW,
  author =       {Manoj Franklin and Gurindar S. Sohi},
  title =        {The expandable split window paradigm for exploiting
                 fine-grain parallelism},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {20},
  number =       {2},
  pages =        {58--67},
  month =        may,
  year =         {1992},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:43 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Litaize:1992:TSM,
  author =       {Daniel Litaize and Abdelaziz Mzoughi and Christine
                 Rochange and Pascal Sainrat},
  title =        {Towards a shared-memory massively parallel
                 multiprocessor},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {20},
  number =       {2},
  pages =        {70--79},
  month =        may,
  year =         {1992},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:43 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Stenstrom:1992:CPE,
  author =       {Per Stenstr{\"o}m and Truman Joe and Anoop Gupta},
  title =        {Comparative performance evaluation of cache-coherent
                 {NUMA} and {COMA} architectures},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {20},
  number =       {2},
  pages =        {80--91},
  month =        may,
  year =         {1992},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:43 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Lenoski:1992:DPI,
  author =       {Daniel Lenoski and James Laudon and Truman Joe and
                 David Nakahira and Luis Stevens and Anoop Gupta and
                 John Hennessy},
  title =        {The {DASH} prototype: implementation and performance},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {20},
  number =       {2},
  pages =        {92--103},
  month =        may,
  year =         {1992},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:43 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Intrater:1992:PED,
  author =       {Gideon Intrater and Ilan Spillinger},
  title =        {Performance evaluation of a decoded instruction cache
                 for variable instruction-length computers},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {20},
  number =       {2},
  pages =        {106--113},
  month =        may,
  year =         {1992},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:43 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Chen:1992:SBS,
  author =       {J. Bradley Chen and Anita Borg and Norman P. Jouppi},
  title =        {A simulation based study of {TLB} performance},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {20},
  number =       {2},
  pages =        {114--123},
  month =        may,
  year =         {1992},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:43 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Yeh:1992:AIT,
  author =       {Tse-Yu Yeh and Yale N. Patt},
  title =        {Alternative implementations of two-level adaptive
                 branch prediction},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {20},
  number =       {2},
  pages =        {124--134},
  month =        may,
  year =         {1992},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:43 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Hirata:1992:EPA,
  author =       {Hiroaki Hirata and Kozo Kimura and Satoshi Nagamine
                 and Yoshiyuki Mochizuki and Akio Nishimura and
                 Yoshimori Nakase and Teiji Nishizawa},
  title =        {An elementary processor architecture with simultaneous
                 instruction issuing from multiple threads},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {20},
  number =       {2},
  pages =        {136--145},
  month =        may,
  year =         {1992},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:43 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Sato:1992:TBP,
  author =       {Mitsuhisa Sato and Yuetsu Kodama and Shuichi Sakai and
                 Yoshinori Yamaguchi and Yasuhito Koumura},
  title =        {Thread-based programming for the {EM-4} hybrid
                 dataflow machine},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {20},
  number =       {2},
  pages =        {146--155},
  month =        may,
  year =         {1992},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:43 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Nikhil:1992:MMP,
  author =       "R. S. Nikhil and G. M. Papadopoulos and Arvind",
  title =        "{*T}: a multithreaded massively parallel architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "156--167",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Dubnicki:1992:ABS,
  author =       {Czarek Dubnicki and Thomas J. LeBlanc},
  title =        {Adjustable block size coherent caches},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {20},
  number =       {2},
  pages =        {170--180},
  month =        may,
  year =         {1992},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:43 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Olukotun:1992:POP,
  author =       "Kunle Olukotun and Trevor Mudge and Richard Brown",
  title =        "Performance optimization of pipelined primary caches",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "181--190",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{McFarling:1992:CRD,
  author =       {Scott McFarling},
  title =        {Cache replacement with dynamic exclusion},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {20},
  number =       {2},
  pages =        {191--200},
  month =        may,
  year =         {1992},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:43 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Keckler:1992:PCI,
  author =       "Stephen W. Keckler and William J. Dally",
  title =        "Processor coupling: integrating compile time and
                 runtime scheduling for parallelism",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "202--213",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Boothe:1992:IMT,
  author =       {Bob Boothe and Abhiram Ranade},
  title =        {Improved multithreading techniques for hiding
                 communication latency in multiprocessors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {20},
  number =       {2},
  pages =        {214--223},
  month =        may,
  year =         {1992},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:43 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{DeGloria:1992:ILP,
  author =       {Alessandro {De Gloria} and Paolo Faraboschi},
  title =        {Instruction-level parallelism in {Prolog}: analysis
                 and architectural support},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {20},
  number =       {2},
  pages =        {224--233},
  month =        may,
  year =         {1992},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:43 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Kurian:1992:MLE,
  author =       {Lizyamma Kurian and Paul T. Hulina and Lee D. Coraor},
  title =        {Memory latency effects in decoupled architectures with
                 a single data memory module},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {20},
  number =       {2},
  pages =        {236--245},
  month =        may,
  year =         {1992},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:43 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Seznec:1992:IPS,
  author =       {Andr{\'e} Seznec and Jacques Lenfant},
  title =        {Interleaved parallel schemes: improving memory
                 throughput on supercomputers},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {20},
  number =       {2},
  pages =        {246--255},
  month =        may,
  year =         {1992},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:43 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{vonEicken:1992:AMM,
  author =       {Thorsten von Eicken and David E. Culler and Seth Copen
                 Goldstein and Klaus Erik Schauser},
  title =        {Active messages: a mechanism for integrated
                 communication and computation},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {20},
  number =       {2},
  pages =        {256--266},
  month =        may,
  year =         {1992},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:43 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Chien:1992:PAR,
  author =       {Andrew A. Chien and Jae H. Kim},
  title =        {Planar-adaptive routing: low-cost adaptive networks
                 for multiprocessors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {20},
  number =       {2},
  pages =        {268--277},
  month =        may,
  year =         {1992},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:43 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Glass:1992:TMA,
  author =       {Christopher J. Glass and Lionel M. Ni},
  title =        {The turn model for adaptive routing},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {20},
  number =       {2},
  pages =        {278--287},
  month =        may,
  year =         {1992},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:43 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Shimizu:1992:LLM,
  author =       {Toshiyuki Shimizu and Takeshi Horie and Hiroaki
                 Ishihata},
  title =        {Low-latency message communication support for the
                 {AP1000}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {20},
  number =       {2},
  pages =        {288--297},
  month =        may,
  year =         {1992},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:43 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Aichinger:1992:FBP,
  author =       {Barbara P. Aichinger},
  title =        {{Futurebus+} as an {I/O} bus: profile {B}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {20},
  number =       {2},
  pages =        {300--307},
  month =        may,
  year =         {1992},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:43 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Reddy:1992:SSO,
  author =       {A. L. Narasimha Reddy},
  title =        {A study of {I/O} system organizations},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {20},
  number =       {2},
  pages =        {308--317},
  month =        may,
  year =         {1992},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:43 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Menon:1992:CSA,
  author =       {Jai Menon and Dick Mattson},
  title =        {Comparison of sparing alternatives for disk arrays},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {20},
  number =       {2},
  pages =        {318--329},
  month =        may,
  year =         {1992},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:43 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Siegle:1992:MPB,
  author =       "Markus Siegle and Richard Hofmann",
  title =        "Monitoring program behaviour on {SUPRENUM}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "332--341",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Austin:1992:DDA,
  author =       "Todd M. Austin and Gurindar S. Sohi",
  title =        "Dynamic dependency analysis of ordinary programs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "342--351",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Najjar:1992:ALL,
  author =       "Walid A. Najjar and W. Marcus Miller and A. P. Wim
                 B{\"o}hm",
  title =        "An analysis of loop latency in dataflow execution",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "352--360",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Yang:1992:NCD,
  author =       "Qing Yang and Liping Wu Yang",
  title =        "A novel cache design for vector processing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "362--371",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Valero:1992:INS,
  author =       "Mateo Valero and Tom{\'a}s Lang and Jos{\'e} M.
                 Llaber{\'\i}a and Montse Peiron and Eduard Ayguad{\'e}
                 and Juan J. Navarro",
  title =        "Increasing the number of strides for conflict-free
                 vector access",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "372--381",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wulf:1992:EWA,
  author =       "Wm. A. Wulf",
  title =        "Evaluation of the {WM} architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "382--390",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Johnson:1992:ICL,
  author =       "Kirk L. Johnson",
  title =        "The impact of communication locality on large-scale
                 multiprocessor performance",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "392--402",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Scott:1992:PSR,
  author =       "Steven L. Scott and James R. Goodman and Mary K.
                 Vernon",
  title =        "Performance of the {SCI} ring",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "403--414",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Talluri:1992:TST,
  author =       "Madhusudhan Talluri and Shing Kong and Mark D. Hill
                 and David A. Patterson",
  title =        "Tradeoffs in supporting two page sizes",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "415--424",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Louri:1992:PEO,
  author =       "Ahmed Louri and Jongwhoa Na",
  title =        "Parallel electro-optical rule-based system for fast
                 execution of expert systems (abstract)",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "427--427",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Seznec:1992:OAF,
  author =       "Andr{\'e} Seznec and Karl Courtel",
  title =        "{OPAC} (abstract): a floating-point coprocessor
                 dedicated to compute-bound kernels",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "427--427",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Cheng:1992:TCB,
  author =       "Der-Chung Cheng and Kanad Ghose",
  title =        "The time-constrained barrier synchronizer and its
                 applications in parallel systems (abstract)",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "428--428",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Louri:1992:NCD,
  author =       "Ahmed Louri and Hongki Sung",
  title =        "A new compiler-directed cache coherence scheme for
                 shared memory multiprocessors with fast and parallel
                 explicit invalidation (abstract)",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "428--428",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Singh:1992:AGP,
  author =       "Gautam B. Singh",
  title =        "Architecture of a graphics processor (abstract)",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "429--429",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Yomtov:1992:PED,
  author =       "Ruben Yomtov",
  title =        "Performance evaluation of disk subsystems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "429--429",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lai:1992:EBS,
  author =       "Feipei Lai and Meng-chou Chang",
  title =        "Enhancing boosting with semantic register in a
                 superscalar processor (abstract)",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "430--430",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sklenar:1992:PUVa,
  author =       "Ivan Sklen{\'a}{\v{r}}",
  title =        "Prefetch unit for vector operations on scalar
                 computers (abstract)",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "430--430",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Newman:1992:MMSa,
  author =       "Gary Newman",
  title =        "Memory management support for tiled array organization
                 (abstract)",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "431--431",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Uht:1992:DPI,
  author =       "Augustus K. Uht and Darin B. Johnson",
  title =        "Data path issues in a highly concurrent machine
                 (abstract)",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "431--431",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Fineberg:1992:SLT,
  author =       "Samuel A. Fineberg and Thomas L. Casavant and Brent H.
                 Pease",
  title =        "Seamless --- a latency-tolerant {RISC}-based
                 multiprocessor architecture (abstract)",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "432--432",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sayeed:1992:PMB,
  author =       "M. A. Sayeed and M. Atiquzzaman",
  title =        "Performance of multiple-bus multiprocessor under
                 non-uniform memory reference model (abstract)",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "432--432",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kechadi:1992:PIV,
  author =       "M. Tahar Kechadi and J.-L. Dekeyser and Ph. Marquet and
                 Ph. Preux",
  title =        "Performance improvement for vector pipeline
                 multiprocessor systems using a disordered execution
                 model (abstract)",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "433--433",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Varma:1992:CPS,
  author =       "Anujan Varma and Gunjan Sinha",
  title =        "A class of prefetch schemes for on-chip data caches",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "433--433",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Abnous:1992:PBV,
  author =       "Arthur Abnous and Nader Bagherzadeh",
  title =        "Pipelining and bypassing in a {VLIW} processor
                 (abstract)",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "434--434",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Prakash:1992:SAS,
  author =       "Shiv Prakash and Alice C. Parker",
  title =        "Synthesis of application-specific heterogeneous
                 multiprocessor systems (abstract)",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "434--434",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Farrens:1992:PTL,
  author =       "Matthew Farrens and Arvin Park and Rob Fanfelle and
                 Pius Ng and Gary Tyson",
  title =        "A partitioned translation lookaside buffer approach to
                 reducing address bandwidth (abstract)",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "435--435",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Laudon:1992:AIT,
  author =       "James Laudon and Anoop Gupta and Mark Horowitz",
  title =        "Architectural and implementation tradeoffs in the
                 design of multiple-context processors (abstract)",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "435--435",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Alleyne:1992:EDN,
  author =       "Brian D. Alleyne and Isaac D. Scherson",
  title =        "Expanded delta networks for very large parallel
                 computers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "436--436",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Singh:1992:IHB,
  author =       "Jaswinder Pal Singh",
  title =        "Implications of hierarchical {N-body} methods for
                 multiprocessor architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "436--436",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Michael:1992:DBC,
  author =       "Wisam Michael",
  title =        "Directory-based cache coherency protocol for a
                 ring-connected multiprocessor-array",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "437--437",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wang:1992:RCD,
  author =       "Wen-Hann Wang and Jim Quinlan and Konrad Lai",
  title =        "Revisit the case for direct-mapped caches: a case for
                 two-way set-associative level-two caches",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "437--437",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Culler:1992:AMM,
  author =       "David E. Culler and Michial Gunter and James C. Lee",
  title =        "Analysis of multithreaded microprocessors under
                 multiprogramming",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "438--438",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wittenbrink:1992:CWG,
  author =       "C. M. Wittenbrink and A. K. Somani and C. H. Chen",
  title =        "Cache write generate for high performance parallel
                 processing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "438--438",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Burkhardt:1992:ICA,
  author =       "Walter H. Burkhardt and Stefan Rust",
  title =        "Integrated computer architecture development system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "2",
  pages =        "439--439",
  month =        may,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:43 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chevance:1992:EMM,
  author =       "R. J. Chevance",
  title =        "An evaluation methodology for microprocessor and
                 system architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "3",
  pages =        "4--13",
  month =        jun,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Laird:1992:CTC,
  author =       "Michael Laird",
  title =        "A comparison of three current superscalar designs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "3",
  pages =        "14--21",
  month =        jun,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Dongarra:1992:PVC,
  author =       "Jack J. Dongarra",
  title =        "Performance of various computers using standard linear
                 equations software",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "3",
  pages =        "22--44",
  month =        jun,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Keown:1992:PHR,
  author =       "William F. {Keown, Jr.} and Philip {Koopman, Jr.} and
                 Aaron Collins",
  title =        "Performance of the {HARRIS RTX 2000} stack
                 architecture versus the {Sun 4 SPARC} and the {Sun 3
                 M68020} architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "3",
  pages =        "45--52",
  month =        jun,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:1992:UNa,
  author =       "Mark Thorson",
  title =        "{Usenet} nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "3",
  pages =        "56--62",
  month =        jun,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chalterjee:1992:BRI,
  author =       "Siddhartha Chatterjee",
  title =        "Book review: {{\em The Impact of Vector and Parallel
                 Architectures on the Gaussian Elimination Algorithm\/}}
                 by {Yves Robert (Manchester University Press and
                 Halsted Press, 1991)}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "3",
  pages =        "63--64",
  month =        jun,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Esponda:1992:GCR,
  author =       "Margarita Esponda and Ra{\'u}l Rojas",
  title =        "A graphical comparison of {RISC} processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "4",
  pages =        "2--8",
  month =        sep,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:12 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Matsui:1992:DRM,
  author =       "Shogo Matsui",
  title =        "Dynamic refresh method for dynamic {RAMs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "4",
  pages =        "9--16",
  month =        sep,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:12 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Park:1992:CRS,
  author =       "Arvin Park and Ron Maeder",
  title =        "Codes to reduce switching transients across {VLSI I/O}
                 pins",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "4",
  pages =        "17--21",
  month =        sep,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:12 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Newman:1992:MMSb,
  author =       "Gary Newman",
  title =        "Memory management support for tiled array
                 organization",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "4",
  pages =        "22--30",
  month =        sep,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:12 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sklenar:1992:PUVb,
  author =       "Ivan Sklen{\'a}{\v{r}}",
  title =        "Prefetch unit for vector operations on scalar
                 computers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "4",
  pages =        "31--37",
  month =        sep,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:12 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Malik:1992:ILP,
  author =       "Nadeem Malik and Richard J. Eickemeyer and Stamatis
                 Vassiliadis",
  title =        "Instruction-level parallelism from execution interlock
                 collapsing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "4",
  pages =        "38--43",
  month =        sep,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:12 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Vassiliadis:1992:ASO,
  author =       "Stamatis Vassiliadis and Bart Blaner and Richard J.
                 Eickemeyer",
  title =        "On the attributes of the {SCISM} organization",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "4",
  pages =        "44--53",
  month =        sep,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:12 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:1992:UNb,
  author =       "Mark Thorson",
  title =        "{Usenet} nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "4",
  pages =        "56--64",
  month =        sep,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:12 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Allen:1992:BRC,
  author =       "Ken Allen",
  title =        "Book review: {{\em Computing with Parallel
                 Architectures: T.Node\/}}, edited by {D. Gassilloud and
                 J. C. Grossetie (Kluwer Academic Publishers 1991)}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "4",
  pages =        "65--66",
  month =        sep,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:12 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Michael:1992:FMB,
  author =       "Gavin Michael and Andrew Chien",
  title =        "Future multicomputers: beyond minimalist
                 multiprocessors?",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "5",
  pages =        "6--12",
  month =        dec,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:19 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kaushal:1992:CHH,
  author =       "R. P. Kaushal and J. S. Bedi",
  title =        "Comparison of hypercube, hypernet, and symmetric
                 hypernet architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "5",
  pages =        "13--25",
  month =        dec,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:19 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:1992:UNc,
  author =       "Mark Thorson",
  title =        "{Usenet} Nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "5",
  pages =        "28--33",
  month =        dec,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:19 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Levy:1992:BRN,
  author =       "David Levy",
  title =        "Book review: {{\em Neural Networks and Fuzzy Systems:
                 A Dynamical Systems Approach to Machine
                 Intelligence\/}} by {Bart Kosko (Prentice Hall 1992)}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "20",
  number =       "5",
  pages =        "34--34",
  month =        dec,
  year =         "1992",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:19 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Inoue:1993:PEV,
  author =       "Atsushi Inoue and Kenji Takeda",
  title =        "Performance evaluation for various configuration of
                 superscalar processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "1",
  pages =        "4--11",
  month =        mar,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:33 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Uht:1993:EMIa,
  author =       "Augustus K. Uht",
  title =        "Extraction of massive instruction level parallelism",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "1",
  pages =        "12--14",
  month =        mar,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:33 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ullah:1993:MIP,
  author =       "Nasr Ullah and Matt Holle",
  title =        "The {MC88110} implementation of precise exceptions in
                 a superscalar architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "1",
  pages =        "15--25",
  month =        mar,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:33 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Deville:1993:PDP,
  author =       "Yannick Deville",
  title =        "A process-dependent partitioning strategy for cache
                 memories",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "1",
  pages =        "26--33",
  month =        mar,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:33 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:1993:UNa,
  author =       "Mark Thorson",
  title =        "{Usenet} Nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "1",
  pages =        "36--38",
  month =        mar,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:33 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Staff:1993:BR,
  author =       "{ACM SIGARCH Computer Architecture News Staff}",
  title =        "Book reviews",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "1",
  pages =        "39--39",
  month =        mar,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:33 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Cypher:1993:ARP,
  author =       "R. Cypher and A. Ho and S. Konstantinidou and P.
                 Messina",
  title =        "Architectural requirements of parallel scientific
                 applications with explicit communication",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "2--13",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Rothberg:1993:WSC,
  author =       "Edward Rothberg and Jaswinder Pal Singh and Anoop
                 Gupta",
  title =        "Working sets, cache sizes, and node granularity issues
                 for large-scale multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "14--26",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Nagle:1993:DTS,
  author =       "David Nagle and Richard Uhlig and Tim Stanley and
                 Stuart Sechrest and Trevor Mudge and Richard Brown",
  title =        "Design tradeoffs for software-managed {TLBs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "27--38",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Huck:1993:AST,
  author =       "Jerry Huck and Jim Hays",
  title =        "Architectural support for translation table management
                 in large address space machines",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "39--50",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Cao:1993:TPR,
  author =       "Pei Cao and Swee Boon Lim and Shivakumar Venkataraman
                 and John Wilkes",
  title =        "The {TickerTAIP} parallel {RAID} architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "52--63",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Stodolsky:1993:PLO,
  author =       "Daniel Stodolsky and Garth Gibson and Mark Holland",
  title =        "Parity logging: overcoming the small write problem in
                 redundant disk arrays",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "64--75",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Menon:1993:AFT,
  author =       "Jai Menon and Jim Cortney",
  title =        "The architecture of a fault-tolerant cached {RAID}
                 controller",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "76--87",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Dubois:1993:DEU,
  author =       "Michel Dubois and Jonas Skeppstedt and Livio Ricciulli
                 and Krishnan Ramamurthy and Per Stenstr{\"o}m",
  title =        "The detection and elimination of useless misses in
                 multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "88--97",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Cox:1993:ACC,
  author =       "Alan L. Cox and Robert J. Fowler",
  title =        "Adaptive cache coherency for detecting migratory
                 shared data",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "98--108",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Stenstrom:1993:ACC,
  author =       "Per Stenstr{\"o}m and Mats Brorsson and Lars
                 Sandberg",
  title =        "An adaptive cache coherence protocol optimized for
                 migratory sharing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "109--118",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Waldspurger:1993:RRF,
  author =       "Carl A. Waldspurger and William E. Weihl",
  title =        "Register relocation: flexible contexts for
                 multithreading",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "120--130",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hidaka:1993:MTC,
  author =       "Yasuo Hidaka and Hanpei Koike and Hidehiko Tanaka",
  title =        "Multiple threads in cyclic register windows",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "131--142",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Dwarkadas:1993:ERC,
  author =       "Sandhya Dwarkadas and Peter Keleher and Alan L. Cox
                 and Willy Zwaenepoel",
  title =        "Evaluation of release consistent software distributed
                 shared memory on emerging network technology",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "144--155",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wood:1993:MCS,
  author =       "David A. Wood and Satish Chandra and Babak Falsafi and
                 Mark D. Hill and James R. Larus and Alvin R. Lebeck and
                 James C. Lewis and Shubhendu S. Mukherjee and Subbarao
                 Palacharla and Steven K. Reinhardt",
  title =        "Mechanisms for cooperative shared memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "156--167",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Seznec:1993:CTW,
  author =       "Andr{\'e} Seznec",
  title =        "A case for two-way skewed-associative caches",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "169--178",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Agarwal:1993:CAC,
  author =       "Anant Agarwal and Stephen D. Pudar",
  title =        "Column-associative caches: a technique for reducing
                 the miss rate of direct-mapped caches",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "179--190",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Jouppi:1993:CWP,
  author =       "Norman P. Jouppi",
  title =        "Cache write policies and performance",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "191--201",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Boyd:1993:HPM,
  author =       "Eric L. Boyd and Edward S. Davidson",
  title =        "Hierarchical performance modeling with {MACS}: a case
                 study of the {Convex C-240}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "203--210",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kuck:1993:CSI,
  author =       "D. Kuck and E. Davidson and D. Lawrie and A. Sameh and
                 C. Q. Zhu and A. Veidenbaum and J. Konicek and P. Yew
                 and K. Gallivan and W. Jalby and H. Wijshoff and R.
                 Bramley and U. M. Yang and P. Emrath and D. Padua and
                 R. Eigenmann and J. Hoeflinger and G. Jaxon and Z. Li
                 and T. Murphy and J. Andrews",
  title =        "The {Cedar} system and an initial performance study",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "213--223",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Noakes:1993:JMM,
  author =       "Michael D. Noakes and Deborah A. Wallach and William
                 J. Dally",
  title =        "The {J-machine} multicomputer: an architectural
                 evaluation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "224--235",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bunda:1993:BVB,
  author =       "John Bunda and Don Fussell and W. C. Athas and Roy
                 Jenevein",
  title =        "16-bit vs. 32-bit instructions for pipelined
                 microprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "237--246",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kiyohara:1993:RCN,
  author =       "Tokuzo Kiyohara and Scott Mahlke and William Chen and
                 Roger Bringmann and Richard Hank and Sadun Anik and
                 Wen-Mei Hwu",
  title =        "Register connection: a new approach to adding
                 registers into instruction set architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "247--256",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Yeh:1993:CDB,
  author =       "Tse-Yu Yeh and Yale N. Patt",
  title =        "A comparison of dynamic branch predictors that use two
                 levels of branch history",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "257--266",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Barroso:1993:PCC,
  author =       "Luiz Andr{\'e} Barroso and Michel Dubois",
  title =        "The performance of cache-coherent ring-based
                 multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "268--277",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tullsen:1993:LCP,
  author =       "Dean M. Tullsen and Susan J. Eggers",
  title =        "Limitations of cache prefetching on a bus-based
                 multiprocessor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "278--288",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Herlihy:1993:TMA,
  author =       "Maurice Herlihy and J. Eliot B. Moss",
  title =        "Transactional memory: architectural support for
                 lock-free data structures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "289--300",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Spertus:1993:EMF,
  author =       "Ellen Spertus and Seth Copen Goldstein and Klaus Erik
                 Schauser and Thorsten von Eicken and David E. Culler
                 and William J. Dally",
  title =        "Evaluation of mechanisms for fine-grained parallel
                 programs in the {J-machine} and the {CM-5}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "302--313",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Horie:1993:IAP,
  author =       "Takeshi Horie and Kenichi Hayashi and Toshiyuki
                 Shimizu and Hiroaki Ishihata",
  title =        "Improving {AP1000} parallel computer performance with
                 message communication",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "314--325",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hsu:1993:PCD,
  author =       "W.-C. Hsu and J. E. Smith",
  title =        "Performance of cached {DRAM} organizations in vector
                 supercomputers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "327--336",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gao:1993:CRT,
  author =       "Q. S. Gao",
  title =        "The {Chinese} remainder theorem and the prime memory
                 system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "337--340",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Seznec and Lenfant: volume 21, number 2 (May 1993), pages 341--350.
%%% The accented letter in the first author's given name is written as a
%%% brace group that begins with a control sequence, the form BibTeX
%%% treats as a single letter when sorting and building labels.
@Article{Seznec:1993:OMS,
  author =       "Andr{\'e} Seznec and Jacques Lenfant",
  title =        "Odd memory systems may be quite interesting",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "341--350",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Boppana and Chalasani: volume 21, number 2 (May 1993), pages
%%% 351--360.  The title has no acronyms or proper nouns, so it carries
%%% no brace protection.
@Article{Boppana:1993:CAW,
  author =       "Rajendra V. Boppana and Suresh Chalasani",
  title =        "A comparison of adaptive wormhole routing algorithms",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "2",
  pages =        "351--360",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Uht: volume 21, number 3 (June 1993), pages 5--12.
%%% NOTE(review): the trailing "b" on the key presumably disambiguates
%%% this entry from a same-year Uht entry with the same title tag
%%% elsewhere in the file; confirm before renaming the key.
@Article{Uht:1993:EMIb,
  author =       "Augustus K. Uht",
  title =        "Extraction of massive instruction level parallelism",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "3",
  pages =        "5--12",
  month =        jun,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Ramanathan and Oren: volume 21, number 3 (June 1993), pages 13--33.
%%% The key shares its year and title tag (SCP) with Ewy:1993:SCP; the
%%% two keys are distinguished only by the first author's surname.
@Article{Ramanathan:1993:SCP,
  author =       "Gowri Ramanathan and Joel Oren",
  title =        "Survey of commercial parallel machines",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "3",
  pages =        "13--33",
  month =        jun,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Ewy and Evans: volume 21, number 3 (June 1993), pages 34--37.
%%% RISC is braced in the title to protect the acronym from downcasing.
@Article{Ewy:1993:SCP,
  author =       "Benjamin J. Ewy and Joseph B. Evans",
  title =        "Secondary cache performance in {RISC} architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "3",
  pages =        "34--37",
  month =        jun,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Danesh: volume 21, number 3 (June 1993), pages 40--45.
%%% The journal and acknowledgement values are bare names, i.e. string
%%% macros that must be defined elsewhere in the file (not visible here).
@Article{Danesh:1993:PLC,
  author =       "Iraj Danesh",
  title =        "Physical limitations of a computer",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "3",
  pages =        "40--45",
  month =        jun,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Thorson: recurring "Usenet Nuggets" column, volume 21, number 3
%%% (June 1993), pages 46--49.  The title capitalization matches the
%%% sibling column entries Thorson:1993:UNc, Thorson:1993:UNd and
%%% Thorson:1994:UNa; the letter suffix on the key tells the columns
%%% apart.
@Article{Thorson:1993:UNb,
  author =       "Mark Thorson",
  title =        "{Usenet} Nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "3",
  pages =        "46--49",
  month =        jun,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Fostel: book review, volume 21, number 3 (June 1993), pages 50--51.
%%% The reviewed book's title is an emphasized group wrapped in an extra
%%% brace pair, and the book's authors and publisher form a second
%%% braced group; both groups are thereby exempt from title downcasing.
%%% NOTE(review): this title begins "Book Reviews:" (plural) although a
%%% single book is named, while neighbouring review entries begin
%%% "Book Review:"; verify against the printed issue before changing.
@Article{Fostel:1993:BRP,
  author =       "Gary Fostel",
  title =        "Book Reviews: {{\em Principles of Computer Systems\/}}
                 by {Gerald M. Karam \& John C. Bryant (Prentice Hall
                 1992)}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "3",
  pages =        "50--51",
  month =        jun,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Fostel: book review, volume 21, number 3 (June 1993), pages 51--53.
%%% In the title, the outer brace pair around the emphasized book title
%%% protects its capitalization; the trailing \/ is an italic
%%% correction.  The book's author and publisher are braced likewise.
@Article{Fostel:1993:BRC,
  author =       "Gary Fostel",
  title =        "Book Review: {{\em Computer Architecture\/}} by {Mario
                 De Blasi (Addison-Wesley Publishing Company, 1990)}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "3",
  pages =        "51--53",
  month =        jun,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Fulcher: book review, volume 21, number 3 (June 1993), pages
%%% 53--54.  The key's title tag (BRP) also occurs in Fostel:1993:BRP;
%%% the surname keeps the two keys distinct.  The word "and" inside the
%%% braced editor list is ordinary title text, not a name separator.
@Article{Fulcher:1993:BRP,
  author =       "John Fulcher",
  title =        "Book Review: {{\em Practical Parallel Computing\/}} by
                 {Paul Messina and Almerico Murli, Editors (John Wiley
                 and Sons, 1992)}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "3",
  pages =        "53--54",
  month =        jun,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:56 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Hill, Larus, Lebeck, Talluri and Wood: volume 21, number 4
%%% (September 1993), pages 8--10.  The entire title is one braced
%%% group, so no style will change its capitalization.
@Article{Hill:1993:WAR,
  author =       "Mark D. Hill and James R. Larus and Alvin R. Lebeck
                 and Madhusudhan Talluri and David A. Wood",
  title =        "{Wisconsin Architectural Research Tool Set}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "4",
  pages =        "8--10",
  month =        sep,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:12 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Hyatt: volume 21, number 4 (September 1993), pages 11--19.
%%% The month value is the bare macro name sep, not a quoted string.
@Article{Hyatt:1993:HPO,
  author =       "Craig Hyatt",
  title =        "A high-performance object-oriented memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "4",
  pages =        "11--19",
  month =        sep,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:12 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Dewan and Nair: volume 21, number 4 (September 1993), pages 20--26.
%%% The second author's three initials are separated by spaces, so each
%%% is parsed as its own given-name token.
@Article{Dewan:1993:CUM,
  author =       "Gautam Dewan and V. S. S. Nair",
  title =        "A case for uniform memory access multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "4",
  pages =        "20--26",
  month =        sep,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:12 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Thorson: recurring "Usenet Nuggets" column, volume 21, number 4
%%% (September 1993), pages 27--28.  The letter suffix on the key
%%% separates this column from Thorson:1993:UNb and Thorson:1993:UNd.
@Article{Thorson:1993:UNc,
  author =       "Mark Thorson",
  title =        "{Usenet} Nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "4",
  pages =        "27--28",
  month =        sep,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:12 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Langdon: "Book Reviews" item, volume 21, number 4 (September 1993).
%%% NOTE(review): pages is written as the range 29--29, i.e. a single
%%% page with equal endpoints; presumably a deliberate file convention,
%%% so confirm before collapsing it to a bare page number.
@Article{Langdon:1993:BR,
  author =       "Glen Langdon",
  title =        "Book Reviews",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "4",
  pages =        "29--29",
  month =        sep,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:12 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Jain, Werth and Browne: special-issue introduction, volume 21,
%%% number 5 (December 1993), pages 5--6.  The special issue's name is
%%% one braced group (capitalization preserved); the slash in
%%% Input/Output is written with the \slash macro.
@Article{Jain:1993:ISI,
  author =       "Ravi Jain and John Werth and J. C. Browne",
  title =        "Introduction to the {Special Issue on Input\slash
                 Output in Parallel Computer Systems}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "5",
  pages =        "5--6",
  month =        dec,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:19 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Corbett, Baylor and Feitelson: volume 21, number 5 (December 1993),
%%% pages 7--14.  Vesta is braced as a proper name.
%%% NOTE(review): in First-Last form the second author is parsed with
%%% surname "Baylor" and given names "Sandra Johnson"; if the surname
%%% is really the compound "Johnson Baylor" the name needs the
%%% "Last, First" form.  Verify.
@Article{Corbett:1993:OVP,
  author =       "Peter F. Corbett and Sandra Johnson Baylor and Dror G.
                 Feitelson",
  title =        "Overview of the {Vesta} parallel file system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "5",
  pages =        "7--14",
  month =        dec,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:19 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Lin and Zhou: volume 21, number 5 (December 1993), pages 15--22.
%%% I/O is braced so both capitals survive title downcasing; the text
%%% after the colon is stored in lower case.
@Article{Lin:1993:PIA,
  author =       "Z. Lin and S. Zhou",
  title =        "Parallelizing {I/O} intensive applications for a
                 workstation cluster: a case study",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "5",
  pages =        "15--22",
  month =        dec,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:19 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Fineberg: volume 21, number 5 (December 1993), pages 23--30.
%%% NHT-1 and I/O are each braced to keep their capitals.
@Article{Fineberg:1993:INA,
  author =       "Samuel A. Fineberg",
  title =        "Implementing the {NHT-1} application {I/O} benchmark",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "5",
  pages =        "23--30",
  month =        dec,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:19 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% del Rosario, Bordawekar and Choudhary: volume 21, number 5
%%% (December 1993), pages 31--38.  The lowercase "del" is parsed by
%%% BibTeX as the von part of the first author's name; the citation key
%%% joins it to the surname as delRosario.
@Article{delRosario:1993:IPT,
  author =       "Juan Miguel del Rosario and Rajesh Bordawekar and Alok
                 Choudhary",
  title =        "Improved parallel {I/O} via a two-phase run-time
                 access strategy",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "5",
  pages =        "31--38",
  month =        dec,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:19 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Ghandeharizadeh, Shahabi and Ramos: volume 21, number 5 (December
%%% 1993), pages 39--46.  The title tag OTS skips the leading article
%%% "An" and takes the initials of the next three words of the title.
@Article{Ghandeharizadeh:1993:OTS,
  author =       "Shahram Ghandeharizadeh and Cyrus Shahabi and Luis
                 Ramos",
  title =        "An overview of techniques to support continuous
                 retrieval of multimedia objects",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "5",
  pages =        "39--46",
  month =        dec,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:19 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Jain, Somalwar, Werth and Browne: volume 21, number 5 (December
%%% 1993), pages 47--54.  Same first author and year as Jain:1993:ISI;
%%% the title tag keeps the keys distinct.
@Article{Jain:1993:SPO,
  author =       "Ravi Jain and Kiran Somalwar and John Werth and J. C.
                 Browne",
  title =        "Scheduling parallel {I/O} operations",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "5",
  pages =        "47--54",
  month =        dec,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:19 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Li and Rishe: volume 21, number 5 (December 1993), pages 55--62.
%%% Only the part number T9000 is braced; "transputer" is stored in
%%% lower case and left unprotected.
@Article{Li:1993:TTF,
  author =       "Qiang Li and Naphtali Rishe",
  title =        "A transputer {T9000} family based architecture for
                 parallel database machines",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "5",
  pages =        "55--62",
  month =        dec,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:19 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Assmann: volume 21, number 5 (December 1993), pages 63--70.
%%% The sharp s in the surname is entered as a braced control sequence
%%% (BibTeX special-character form); the citation key uses the ASCII
%%% spelling "Assmann" instead.
@Article{Assmann:1993:RPA,
  author =       "Claus A{\ss}mann",
  title =        "A {RISC} processor architecture with a versatile stack
                 system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "5",
  pages =        "63--70",
  month =        dec,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:19 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Wang: volume 21, number 5 (December 1993), pages 71--78.
%%% The title quotes another title: the quoted text sits in one braced
%%% group and uses TeX-style paired quote marks, so its capital "D" is
%%% preserved and no double-quote character appears inside the
%%% double-quoted field value.
@Article{Wang:1993:NDH,
  author =       "Dajin Wang",
  title =        "A note on {``Diagnosabilities of hypercubes under the
                 pessimistic one-step diagnosis strategy''}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "5",
  pages =        "71--78",
  month =        dec,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:19 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Thorson: recurring "Usenet Nuggets" column, volume 21, number 5
%%% (December 1993), pages 79--85.  The letter suffix on the key
%%% separates this column from Thorson:1993:UNb and Thorson:1993:UNc.
@Article{Thorson:1993:UNd,
  author =       "Mark Thorson",
  title =        "{Usenet} Nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "5",
  pages =        "79--85",
  month =        dec,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:19 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Alverson: book review, volume 21, number 5 (December 1993), pages
%%% 85--86.  The reviewed book's title is an emphasized group inside an
%%% extra brace pair, and its authors and publisher form a second
%%% braced group, so neither is recased by styles.
@Article{Alverson:1993:BRH,
  author =       "Bob Alverson",
  title =        "Book Review: {{\em High-Speed Digital Design: A
                 Handbook of Black Magic\/}} by {Howard W. Johnson and
                 Martin Graham (Prentice-Hall, 1993)}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21",
  number =       "5",
  pages =        "85--86",
  month =        dec,
  year =         "1993",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:19 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Iannucci and five co-authors: panel session I, volume 22, number 1
%%% (March 1994), pages 3--18.  The roman numeral is braced so that it
%%% is not lowercased.
@Article{Iannucci:1994:AII,
  author =       "Robert Iannucci and Anant Agarwal and Bill Dally and
                 Anoop Gupta and Greg Papadopoulos and Burton Smith",
  title =        "Architectural and implementation issues for
                 multithreading (panel session {I})",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "22",
  number =       "1",
  pages =        "3--18",
  month =        mar,
  year =         "1994",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:34 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Halstead and four co-authors: panel session II, volume 22, number 1
%%% (March 1994), pages 19--33.  The roman numeral is braced so that it
%%% is not lowercased.
@Article{Halstead:1994:PCR,
  author =       "Burt Halstead and David Callahan and Jack Dennis and
                 R. S. Nikhil and Vivek Sarkar",
  title =        "Programming, compilation, and resource management
                 issues for multithreading (panel session {II})",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "22",
  number =       "1",
  pages =        "19--33",
  month =        mar,
  year =         "1994",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:34 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Baker: volume 22, number 1 (March 1994), pages 34--43.
%%% The triple hyphen is the TeX ligature for an em dash; Forth is
%%% braced so that it keeps its capital.
@Article{Baker:1994:LLP,
  author =       "Henry G. Baker",
  title =        "Linear logic and permutation stacks---the {Forth}
                 shall be first",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "22",
  number =       "1",
  pages =        "34--43",
  month =        mar,
  year =         "1994",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:34 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Mendlson, Pinter and Shtokhamer: volume 22, number 1 (March 1994),
%%% pages 44--51.
%%% NOTE(review): the first author's surname is spelled "Mendlson" in
%%% both the author field and the key; this may be a variant of
%%% "Mendelson".  Check the printed byline before altering it, and keep
%%% the key stable if the author field is corrected.
@Article{Mendlson:1994:CTI,
  author =       "Abraham Mendlson and Shlomit S. Pinter and Ruth
                 Shtokhamer",
  title =        "Compile time instruction cache optimizations",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "22",
  number =       "1",
  pages =        "44--51",
  month =        mar,
  year =         "1994",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:34 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Barach and eight co-authors: volume 22, number 1 (March 1994),
%%% pages 52--58.  HALSIM and "SPARC V9" are braced groups, so their
%%% capitals are kept; the triple hyphen is an em dash.
@Article{Barach:1994:HVF,
  author =       "David Barach and Jaspal Kohli and John Slice and Marc
                 Spaulding and Rajeev Bharadhwaj and Don Hudson and
                 Cliff Neighbors and Nirmal Saxena and Rolland Crunk",
  title =        "{HALSIM}---a very fast {SPARC V9} behavioral model",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "22",
  number =       "1",
  pages =        "52--58",
  month =        mar,
  year =         "1994",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:34 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Thorson: recurring "Usenet Nuggets" column, volume 22, number 1
%%% (March 1994), pages 59--60.  The key carries the letter suffix "a"
%%% in the same way as the 1993 columns by this author.
@Article{Thorson:1994:UNa,
  author =       "Mark Thorson",
  title =        "{Usenet} Nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "22",
  number =       "1",
  pages =        "59--60",
  month =        mar,
  year =         "1994",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:34 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Madruga: book review, volume 22, number 1 (March 1994), pages
%%% 60--61.  In First-Last form the author is parsed with surname
%%% "Madruga", which agrees with the citation key.  The book title and
%%% the author/publisher text are braced groups, exempt from recasing.
@Article{Madruga:1994:BRS,
  author =       "Ewerton Longoni Madruga",
  title =        "Book Review: {{\em SNMP, SNMPv2, and CMIP: The
                 Practical Guide to Network Management Standards\/}} by
                 {William Stallings (Addison-Wesley Publishing Company
                 Inc. 1993)}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "22",
  number =       "1",
  pages =        "60--61",
  month =        mar,
  year =         "1994",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:34 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Calder and Grunwald: volume 22, number 2 (April 1994), pages 2--11.
%%% Author given names are recorded as initials only.
@Article{Calder:1994:FAI,
  author =       "B. Calder and D. Grunwald",
  title =        "Fast and accurate instruction fetch and branch
                 prediction",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "22",
  number =       "2",
  pages =        "2--11",
  month =        apr,
  year =         "1994",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Talcott, Yamamoto, Serrano, Wood and Nemirovsky: volume 22,
%%% number 2 (April 1994), pages 12--21.  The author list is wrapped
%%% across two lines; the line break falls between the fourth author's
%%% initials and is ordinary whitespace to BibTeX.
@Article{Talcott:1994:IUB,
  author =       "A. R. Talcott and W. Yamamoto and M. J. Serrano and R.
                 C. Wood and M. Nemirovsky",
  title =        "The impact of unresolved branches on branch prediction
                 scheme performance",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "22",
  number =       "2",
  pages =        "12--21",
  month =        apr,
  year =         "1994",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Palacharla and Kessler: volume 22, number 2 (April 1994), pages
%%% 24--33.  The title has no acronyms or proper nouns, so it carries
%%% no brace protection.
@Article{Palacharla:1994:ESB,
  author =       "S. Palacharla and R. E. Kessler",
  title =        "Evaluating stream buffers as a secondary cache
                 replacement",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "22",
  number =       "2",
  pages =        "24--33",
  month =        apr,
  year =         "1994",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Jouppi and Wilton: volume 22, number 2 (April 1994), pages 34--45.
%%% Initials are space-separated, so the second author has three
%%% given-name tokens.
@Article{Jouppi:1994:TTL,
  author =       "N. P. Jouppi and S. J. E. Wilton",
  title =        "Tradeoffs in two-level on-chip caching",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "22",
  number =       "2",
  pages =        "34--45",
  month =        apr,
  year =         "1994",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Singhal and Goldberg: volume 22, number 2 (April 1994), pages
%%% 48--59.  The product name "SPARCcenter 2000" is one braced group,
%%% which keeps its mixed capitalization under title downcasing.
@Article{Singhal:1994:ASP,
  author =       "A. Singhal and A. J. Goldberg",
  title =        "Architectural support for performance tuning: a case
                 study on the {SPARCcenter 2000}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "22",
  number =       "2",
  pages =        "48--59",
  month =        apr,
  year =         "1994",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Cvetanovic and Bhandarkar: volume 22, number 2 (April 1994), pages
%%% 60--70.  "Alpha AXP", TP and SPEC are braced groups, protected from
%%% title downcasing.
@Article{Cvetanovic:1994:CAA,
  author =       "Z. Cvetanovic and D. Bhandarkar",
  title =        "Characterization of {Alpha AXP} performance using {TP}
                 and {SPEC} workloads",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "22",
  number =       "2",
  pages =        "60--70",
  month =        apr,
  year =         "1994",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Natarajan, Sharma and Iyer: volume 22, number 2 (April 1994), pages
%%% 71--80.  The title tag MBC counts the hyphenated
%%% "Measurement-based" as two words.
@Article{Natarajan:1994:MBC,
  author =       "C. Natarajan and S. Sharma and R. K. Iyer",
  title =        "Measurement-based characterization of global memory
                 and network contention, operating system and
                 parallelization overheads",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "22",
  number =       "2",
  pages =        "71--80",
  month =        apr,
  year =         "1994",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Joe and Hennessy: volume 22, number 2 (April 1994), pages 82--93.
%%% COMA is braced in the title to protect the acronym from downcasing.
@Article{Joe:1994:EMO,
  author =       "T. Joe and J. L. Hennessy",
  title =        "Evaluating the memory overhead required for {COMA}
                 architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "22",
  number =       "2",
  pages =        "82--93",
  month =        apr,
  year =         "1994",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Klaiber and Levy: volume 22, number 2 (April 1994), pages 94--105.
%%% The title tag CMP skips the leading article "A".
@Article{Klaiber:1994:CMP,
  author =       "A. C. Klaiber and H. M. Levy",
  title =        "A comparison of message passing and shared memory
                 architectures for data parallel programs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "22",
  number =       "2",
  pages =        "94--105",
  month =        apr,
  year =         "1994",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Cox and five co-authors: volume 22, number 2 (April 1994), pages
%%% 106--117.  The subtitle after the colon is stored in lower case.
@Article{Cox:1994:SVH,
  author =       "A. L. Cox and S. Dwarkadas and P. Keleher and H. Lu
                 and R. Rajamony and W. Zwaenepoel",
  title =        "Software versus hardware shared-memory implementation:
                 a case study",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "22",
  number =       "2",
  pages =        "106--117",
  month =        apr,
  year =         "1994",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Pnevmatikatos and Sohi: volume 22, number 2 (April 1994), pages
%%% 120--129.  ILP is braced in the title to protect the acronym.
@Article{Pnevmatikatos:1994:GEB,
  author =       "D. N. Pnevmatikatos and G. S. Sohi",
  title =        "Guarded execution and branch prediction in dynamic
                 {ILP} processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "22",
  number =       "2",
  pages =        "120--129",
  month =        apr,
  year =         "1994",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Author initials normalized: the first author's second initial now
%%% carries its period ("C.-L. Su"), matching the other hyphenated
%%% initials in this file (e.g., "T.-F. Chen", "J.-L. Baer").
@Article{Su:1994:BMS,
  author =       "C.-L. Su and A. M. Despain",
  title =        "Branch with masked squashing in superpipelined
                 processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "22",
  number =       "2",
  pages =        "130--140",
  month =        apr,
  year =         "1994",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Blumrich:1994:VMM,
  author          = {M. A. Blumrich and K. Li and R. Alpert and C. Dubnicki
                     and E. W. Felten and J. Sandberg},
  title           = {Virtual memory mapped network interface for the
                     {SHRIMP} multicomputer},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {2},
  pages           = {142--153},
  month           = apr,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:40 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Steenkiste:1994:AEH,
  author          = {P. Steenkiste and M. Hemy and T. Mummert and B. Zill},
  title           = {Architecture and evaluation of a high-speed networking
                     subsystem for distributed-memory systems},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {2},
  pages           = {154--163},
  month           = apr,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:40 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Nayfeh:1994:EDS,
  author          = {B. A. Nayfeh and K. Olukotun},
  title           = {Exploring the design space for a shared-cache
                     multiprocessor},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {2},
  pages           = {166--175},
  month           = apr,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:40 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Thekkath:1994:ISB,
  author          = {R. Thekkath and S. J. Eggers},
  title           = {Impact of sharing-based thread placement on
                     multithreaded architectures},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {2},
  pages           = {176--186},
  month           = apr,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:40 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Dahlgren:1994:CPG,
  author          = {F. Dahlgren and M. Dubois and P. Stenstr{\"o}m},
  title           = {Combined performance gains of simple cache protocol
                     extensions},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {2},
  pages           = {187--197},
  month           = apr,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:40 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Huang:1994:SDC,
  author          = {A. S. Huang and G. Slavenburg and J. P. Shen},
  title           = {Speculative disambiguation: a compilation technique
                     for dynamic memory disambiguation},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {2},
  pages           = {200--210},
  month           = apr,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:40 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Farkas:1994:CPT,
  author          = {K. I. Farkas and N. P. Jouppi},
  title           = {Complexity\slash performance tradeoffs with
                     non-blocking loads},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {2},
  pages           = {211--222},
  month           = apr,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:40 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Chen:1994:PSS,
  author          = {T.-F. Chen and J.-L. Baer},
  title           = {A performance study of software and hardware data
                     prefetching schemes},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {2},
  pages           = {223--232},
  month           = apr,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:40 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Drapeau:1994:RIH,
  author          = {A. L. Drapeau and K. W. Shirriff and J. H. Hartman and
                     E. L. Miller and S. Seshan and R. H. Katz and K. Lutz
                     and D. A. Patterson and E. K. Lee and P. M. Chen and G.
                     A. Gibson},
  title           = {{RAID-II}: a high-bandwidth network file server},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {2},
  pages           = {234--244},
  month           = apr,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:40 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Blaum:1994:EOS,
  author          = {M. Blaum and J. Brady and J. Bruck and J. Menon},
  title           = {{EVENODD}: an optimal scheme for tolerating double
                     disk failures in {RAID} architectures},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {2},
  pages           = {245--254},
  month           = apr,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:40 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Ng:1994:CDA,
  author          = {S. W. Ng},
  title           = {Crosshatch disk array for improved reliability and
                     performance},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {2},
  pages           = {255--264},
  month           = apr,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:40 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{DeHon:1994:MRA,
  author          = {A. DeHon and F. Chong and M. Becker and E. Egozy and
                     H. Minsky and S. Peretz and T. F. {Knight, Jr.}},
  title           = {{METRO}: a router architecture for high-performance,
                     short-haul routing networks},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {2},
  pages           = {266--277},
  month           = apr,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:40 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Allen:1994:AAR,
  author          = {J. D. Allen and P. T. Gaughan and D. E. Schimmel and
                     S. Yalamanchili},
  title           = {{Ariadne}---an adaptive router for fault-tolerant
                     multicomputers},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {2},
  pages           = {278--288},
  month           = apr,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:40 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Kim:1994:CRF,
  author          = {J. H. Kim and Z. Liu and A. A. Chien},
  title           = {Compressionless routing: a framework for adaptive and
                     fault-tolerant routing},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {2},
  pages           = {289--300},
  month           = apr,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:40 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Kuskin:1994:SFM,
  author          = {J. Kuskin and D. Ofelt and M. Heinrich and J. Heinlein
                     and R. Simoni and K. Gharachorloo and J. Chapin and D.
                     Nakahira and J. Baxter and M. Horowitz and A. Gupta and
                     M. Rosenblum and J. Hennessy},
  title           = {The {Stanford FLASH} multiprocessor},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {2},
  pages           = {302--313},
  month           = apr,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:40 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Chaiken:1994:SEC,
  author          = {D. Chaiken and A. Agarwal},
  title           = {Software-extended coherent shared memory: performance
                     and cost},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {2},
  pages           = {314--324},
  month           = apr,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:40 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Reinhardt:1994:TTU,
  author          = {S. K. Reinhardt and J. R. Larus and D. A. Wood},
  title           = {{Tempest} and {Typhoon}: user-level shared memory},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {2},
  pages           = {325--336},
  month           = apr,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:40 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Farrens:1994:SSC,
  author          = {M. Farrens and G. Tyson and A. R. Pleszkun},
  title           = {A study of single-chip processor\slash cache
                     organizations for large numbers of transistors},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {2},
  pages           = {338--347},
  month           = apr,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:40 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Chen:1994:UAT,
  author          = {C.-H. Chen and A. K. Somani},
  title           = {A unified architectural tradeoff methodology},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {2},
  pages           = {348--357},
  month           = apr,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:40 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Nagle:1994:OAC,
  author          = {D. Nagle and R. Uhlig and T. Mudge and S. Sechrest},
  title           = {Optimal allocation of on-chip memory for
                     multiple-{API} operating systems},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {2},
  pages           = {358--369},
  month           = apr,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:40 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Quong:1994:ECM,
  author          = {R. W. Quong},
  title           = {Expected {I-cache} miss rates via the gap model},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {2},
  pages           = {372--383},
  month           = apr,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:40 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

%%% Title completed with its final clause ("and low miss ratio"); the
%%% previous value stopped at "implementation cost", leaving the verb
%%% "conciliating" with only one of its two objects.
%%% TODO(review): confirm the wording against the printed issue.
@Article{Seznec:1994:DSC,
  author =       "A. Seznec",
  title =        "Decoupled sectored caches: conciliating low tag
                 implementation cost and low miss ratio",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "22",
  number =       "2",
  pages =        "384--393",
  month =        apr,
  year =         "1994",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:40 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gurd:1994:SBB,
  author          = {J. R. Gurd},
  title           = {Supercomputing: big bang or steady state growth?},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {3},
  pages           = {3--13},
  month           = jun,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:57 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Litchfield:1994:IES,
  author          = {Kay P. Litchfield},
  title           = {Instruction execution sequence confirmation},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {3},
  pages           = {14--18},
  month           = jun,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:57 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Allen:1994:RWR,
  author          = {Phil Allen and Franc Brglez and Hal Carter and Robert
                     Caverly and Jerry Dillion and Albert Lo and Ron Lomax
                     and John Oldfield and Cesar Pina and T. J. Wilkinson},
  title           = {Report of the {1993 Workshop on Rapid Prototyping of
                     Microelectronic Systems for Universities}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {3},
  pages           = {19--26},
  month           = jun,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:57 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:1994:UNb,
  author          = {Mark Thorson},
  title           = {{Usenet} Nuggets},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {3},
  pages           = {27--28},
  month           = jun,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:57 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Madruga:1994:BRI,
  author          = {Ewerton Longoni Madruga},
  title           = {Book Review: {{\em Internetworking with TCP/IP, vol.
                     III: Client-Server programming and applications (BSD
                     Sockets version)\/}} by {Douglas E. Comer and David L.
                     Stevens (Prentice-Hall, 1993)}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {3},
  pages           = {29--30},
  month           = jun,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:57 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Jain:1994:SII,
  author          = {Ravi Jain and John Werth and J. C. Browne},
  title           = {{Special Issue on Input\slash Output in Parallel
                     Computer Systems}: {Introduction}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {4},
  pages           = {3--4},
  month           = sep,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:12 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Baylor:1994:PEM,
  author          = {Sandra Johnson Baylor and Caroline Benveniste and
                     Yarsun Hsu},
  title           = {Performance evaluation of a massively parallel {I/O}
                     subsystem},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {4},
  pages           = {5--10},
  month           = sep,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:12 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Sinclair:1994:IPS,
  author          = {James B. Sinclair and Jay Tang and Peter J. Varman},
  title           = {Instability in parallel {I/O} systems},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {4},
  pages           = {11--16},
  month           = sep,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:12 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Vanderleest:1994:MBC,
  author          = {Steven H. Vanderleest and Ravishankar K. Iyer},
  title           = {Measurement of {I/O} bus contention and correlation
                     among heterogeneous device types in a single-bus
                     multiprocessor system},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {4},
  pages           = {17--22},
  month           = sep,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:12 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Thakur:1994:CCD,
  author          = {Rajeev Thakur and Rajesh Bordawekar and Alok
                     Choudhary},
  title           = {Compilation of out-of-core data parallel programs for
                     distributed memory machines},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {4},
  pages           = {23--28},
  month           = sep,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:12 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Asthana:1994:EAM,
  author          = {Abhaya Asthana and Mark Cravatts and Paul
                     Krzyzanowski},
  title           = {An experimental active memory based {I/O} subsystem},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {4},
  pages           = {29--34},
  month           = sep,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:12 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Durand:1994:DSA,
  author          = {Dannie Durand and Ravi Jain and David Tseytlin},
  title           = {Distributed scheduling algorithms to improve the
                     performance of parallel data transfers},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {4},
  pages           = {35--40},
  month           = sep,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:12 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Yokota:1994:DND,
  author          = {Haruo Yokota},
  title           = {{DR-nets}: data-reconstruction networks for highly
                     reliable parallel-disk systems},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {4},
  pages           = {41--46},
  month           = sep,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:12 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Forsell:1994:MMPa,
  author          = {Martti J. Forsell},
  title           = {Are multiport memories physically feasible?},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {4},
  pages           = {47--54},
  month           = sep,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:12 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Chaudhry:1994:CMP,
  author          = {Ghulam Chaudhry and Xuechang Li},
  title           = {A case for the multithreaded processor architecture},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {4},
  pages           = {55--59},
  month           = sep,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:12 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Chan:1994:ECF,
  author          = {Yin Chan and Ashok Sudarsanam and Andrew Wolfe},
  title           = {The effect of compiler-flag tuning on {SPEC} benchmark
                     performance},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {4},
  pages           = {60--70},
  month           = sep,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:12 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Lee:1994:RCC,
  author          = {Jin-Ho Lee and Min-Young Lee and Seong-Uk Choi and
                     Myong-Soon Park},
  title           = {Reducing cache conflicts in data cache prefetching},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {4},
  pages           = {71--77},
  month           = sep,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:12 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:1994:UNc,
  author          = {Mark Thorson},
  title           = {{Usenet} Nuggets},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {4},
  pages           = {78--81},
  month           = sep,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:12 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Forsell:1994:MMPb,
  author          = {Martti J. Forsell},
  title           = {Are multiport memories physically feasible?},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {5},
  pages           = {3--10},
  month           = dec,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:20 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Sosic:1994:HCH,
  author          = {Rok Sosi{\v{c}}},
  title           = {History cache: hardware support for reverse
                     execution},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {5},
  pages           = {11--18},
  month           = dec,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:20 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Hill:1994:WWT,
  author          = {Mark D. Hill and James R. Larus and David A. Wood},
  title           = {The {Wisconsin Wind Tunnel} project: an annotated
                     bibliography},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {22},
  number          = {5},
  pages           = {19--26},
  month           = dec,
  year            = {1994},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:20 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Saha:1994:DDT,
  author =       {Avijit Saha and Nadeem Malik},
  title =        {Distributed directory tags},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {22},
  number =       {5},
  pages =        {27--29},
  month =        dec,
  year =         {1994},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:20 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Unwala:1994:SMP,
  author =       {Ishaq H. Unwala and Harvey G. Cragon},
  title =        {A study of {MIPS} programs},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {22},
  number =       {5},
  pages =        {30--40},
  month =        dec,
  year =         {1994},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:20 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:1994:IN,
  author =       {Mark Thorson},
  title =        {{Internet} Nuggets},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {22},
  number =       {5},
  pages =        {41--46},
  month =        dec,
  year =         {1994},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:20 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

%%% NOTE(review): the subject matter (on-line methods and procedures
%%% documentation) looks out of scope for a computer-architecture
%%% newsletter; verify the volume/number attribution against the
%%% publisher's table of contents.
@Article{Ohnemus:1994:BIL,
  author =       {Kenneth R. Ohnemus and Diana F. Mallin},
  title =        {Benefits of implementing on-line methods and
                 procedures},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {22},
  number =       {5},
  pages =        {49--55},
  month =        dec,
  year =         {1994},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:20 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

%%% NOTE(review): the subject matter (the technical writer's role on a
%%% design team) looks out of scope for a computer-architecture
%%% newsletter; verify the volume/number attribution against the
%%% publisher's table of contents.
@Article{Cunningham:1994:LDT,
  author =       {Daniel K. Cunningham and Steven J. Reilly},
  title =        {Leading the design team---the evolution of the
                 technical writer from a support role to a design role},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {22},
  number =       {5},
  pages =        {56--60},
  month =        dec,
  year =         {1994},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:20 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

%%% NOTE(review): the subject matter (multimedia performance-support
%%% systems) looks out of scope for a computer-architecture newsletter;
%%% verify the volume/number attribution against the publisher's table
%%% of contents.
@Article{Rockley:1994:MTE,
  author =       {Ann Rockley},
  title =        {Multimedia: towards an electronic performance support
                 system},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {22},
  number =       {5},
  pages =        {61--65},
  month =        dec,
  year =         {1994},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:20 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

%%% NOTE(review): the subject matter (multiple-site documentation
%%% projects) looks out of scope for a computer-architecture
%%% newsletter; verify the volume/number attribution against the
%%% publisher's table of contents.
@Article{Drew:1994:TTM,
  author =       {Katherine E. Drew},
  title =        {Telecommunicators and telecommuters: making
                 multiple-site documentation projects work},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {22},
  number =       {5},
  pages =        {66--75},
  month =        dec,
  year =         {1994},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:20 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Severson:1995:TCP,
  author =       {Aimee Severson and Brent Nelson},
  title =        {Throughput in a counterflow pipeline processor},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {1},
  pages =        {5--12},
  month =        mar,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:34 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Hsu:1995:SAC,
  author =       {Tsong-Chih Hsu and Sheng-De Wang},
  title =        {A simple architecture for constant time sorting
                 machines},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {1},
  pages =        {13--19},
  month =        mar,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:34 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Wulf:1995:HMW,
  author =       {Wm. A. Wulf and Sally A. McKee},
  title =        {Hitting the memory wall: implications of the obvious},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {1},
  pages =        {20--24},
  month =        mar,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:34 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:1995:INa,
  author =       {Mark Thorson},
  title =        {{Internet} Nuggets},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {1},
  pages =        {25--28},
  month =        mar,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:34 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Agarwal:1995:AMA,
  author =       {Anant Agarwal and Ricardo Bianchini and David Chaiken
                 and Kirk L. Johnson and David Kranz and John
                 Kubiatowicz and Beng-Hong Lim and Kenneth Mackenzie and
                 Donald Yeung},
  title =        {The {MIT Alewife} machine: architecture and
                 performance},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {2},
  pages =        {2--13},
  month =        may,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Kodama:1995:EXP,
  author =       {Yuetsu Kodama and Hirohumi Sakane and Mitsuhisa Sato
                 and Hayato Yamana and Shuichi Sakai and Yoshinori
                 Yamaguchi},
  title =        {The {EM-X} parallel computer: architecture and basic
                 performance},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {2},
  pages =        {14--23},
  month =        may,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Woo:1995:SPC,
  author =       {Steven Cameron Woo and Moriyoshi Ohara and Evan Torrie
                 and Jaswinder Pal Singh and Anoop Gupta},
  title =        {The {SPLASH-2} programs: characterization and
                 methodological considerations},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {2},
  pages =        {24--36},
  month =        may,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Grahn:1995:ESS,
  author =       {H{\aa}kan Grahn and Per Stenstr{\"o}m},
  title =        {Efficient strategies for software-only protocols in
                 shared-memory multiprocessors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {2},
  pages =        {38--47},
  month =        may,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Lebeck:1995:DSI,
  author =       {Alvin R. Lebeck and David A. Wood},
  title =        {Dynamic self-invalidation: reducing coherence overhead
                 in shared-memory multiprocessors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {2},
  pages =        {48--59},
  month =        may,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Dahlgren:1995:BPH,
  author =       {Fredrik Dahlgren},
  title =        {Boosting the performance of hybrid snooping cache
                 protocols},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {2},
  pages =        {60--69},
  month =        may,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Nowatzyk:1995:CNW,
  author =       {Andreas G. Nowatzyk and Michael C. Browne and Edmund
                 J. Kelly and Michael Parkin},
  title =        {{S}-connect: from networks of workstations to
                 supercomputer performance},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {2},
  pages =        {71--82},
  month =        may,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Varma:1995:DAD,
  author =       {Anujan Varma and Quinn Jacobson},
  title =        {Destage algorithms for disk arrays with non-volatile
                 caches},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {2},
  pages =        {83--95},
  month =        may,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Stoll:1995:EMP,
  author =       {Gordon Stoll and Bin Wei and Douglas Clark and Edward
                 W. Felten and Kai Li and Patrick Hanrahan},
  title =        {Evaluating multi-port frame buffer designs for a
                 mesh-connected multicomputer},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {2},
  pages =        {96--105},
  month =        may,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Nowatzyk:1995:CRD,
  author =       "Andreas G. Nowatzyk and Paul R. Prucnal",
  title =        "Are crossbars really dead? {The} case for optical
                 multiprocessor interconnect systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "2",
  pages =        "106--115",
  month =        may,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Jourdan:1995:ECF,
  author =       {St{\'e}phan Jourdan and Pascal Sainrat and Daniel
                 Litaize},
  title =        {Exploring configurations of functional units in an
                 out-of-order superscalar processor},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {2},
  pages =        {117--125},
  month =        may,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Ando:1995:USE,
  author =       {Hideki Ando and Chikako Nakanishi and Tetsuya Hara and
                 Masao Nakaya},
  title =        {Unconstrained speculative execution with predicated
                 state buffering},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {2},
  pages =        {126--137},
  month =        may,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Mahlke:1995:CFP,
  author =       {Scott A. Mahlke and Richard E. Hank and James E.
                 McCormick and David I. August and Wen-Mei W. Hwu},
  title =        {A comparison of full and partial predicated execution
                 support for {ILP} processors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {2},
  pages =        {138--150},
  month =        may,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Simone:1995:ITO,
  author =       {M. Simone and A. Essen and A. Ike and A.
                 Krishnamoorthy and T. Maruyama and N. Patkar and M.
                 Ramaswami and M. Shebanow and V. Thirumalaiswamy and D.
                 Tovey},
  title =        {Implementation trade-offs in using a restricted data
                 flow architecture in a high performance {RISC}
                 microprocessor},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {2},
  pages =        {151--162},
  month =        may,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Diep:1995:PEP,
  author =       {Trung A. Diep and Christopher Nelson and John Paul
                 Shen},
  title =        {Performance evaluation of the {PowerPC 620}
                 microarchitecture},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {2},
  pages =        {163--174},
  month =        may,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Romer:1995:RTM,
  author =       {Theodore H. Romer and Wayne H. Ohlrich and Anna R.
                 Karlin and Brian N. Bershad},
  title =        {Reducing {TLB} and memory overhead using online
                 superpage promotion},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {2},
  pages =        {176--187},
  month =        may,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Zhang:1995:SIA,
  author =       {Zheng Zhang and Josep Torrellas},
  title =        {Speeding up irregular applications in shared-memory
                 multiprocessors: memory binding and group prefetching},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {2},
  pages =        {188--199},
  month =        may,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Anjan:1995:EFA,
  author =       {K. V. Anjan and Timothy Mark Pinkston},
  title =        {An efficient, fully adaptive deadlock recovery scheme:
                 {DISHA}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {2},
  pages =        {201--210},
  month =        may,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Shin:1995:AIH,
  author =       {Kang G. Shin and Stuart W. Daniel},
  title =        {Analysis and implementation of hybrid switching},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {2},
  pages =        {211--219},
  month =        may,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Dao:1995:CFC,
  author =       {Binh Vien Dao and Jose Duato and Sudhakar
                 Yalamanchili},
  title =        {Configurable flow control mechanisms for
                 fault-tolerant routing},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {2},
  pages =        {220--229},
  month =        may,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Callahan:1995:NLO,
  author =       {Timothy Callahan and Seth Copen Goldstein},
  title =        {{NIFDY}: a low overhead, high throughput network
                 interface},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {2},
  pages =        {230--241},
  month =        may,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Peiron:1995:VMA,
  author =       {Montse Peiron and Mateo Valero and Eduard Ayguad{\'e}
                 and Tom{\'a}s Lang},
  title =        {Vector multiprocessors with arbitrated memory access},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {2},
  pages =        {243--252},
  month =        may,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Kavi:1995:DCM,
  author =       {Krishna M. Kavi and A. R. Hurson and Phenil Patadia
                 and Elizabeth Abraham and Ponnarasu Shanmugam},
  title =        {Design of cache memories for multi-threaded dataflow
                 architecture},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {2},
  pages =        {253--264},
  month =        may,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Bodin:1995:SAE,
  author =       {Fran{\c{c}}ois Bodin and Andr{\'e} Seznec},
  title =        {Skewed associativity enhances performance
                 predictability},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {2},
  pages =        {265--274},
  month =        may,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Young:1995:CAS,
  author =       {Cliff Young and Nicolas Gloy and Michael D. Smith},
  title =        {A comparative analysis of schemes for correlated
                 branch prediction},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {2},
  pages =        {276--286},
  month =        may,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Calder:1995:NCL,
  author =       {Brad Calder and Dirk Grunwald},
  title =        {Next cache line and set prediction},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {2},
  pages =        {287--296},
  month =        may,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Karamcheti:1995:CAS,
  author =       {Vijay Karamcheti and Andrew A. Chien},
  title =        {A comparison of architectural support for messaging in
                 the {TMC CM-5} and the {Cray T3D}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {2},
  pages =        {298--307},
  month =        may,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Stricker:1995:OMS,
  author =       "Thomas Stricker and Thomas Gross",
  title =        "Optimizing memory system performance for communication
                 in parallel computers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "2",
  pages =        "308--319",
  month =        may,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Arpaci:1995:EEC,
  author =       "Remzi H. Arpaci and David E. Culler and Arvind
                 Krishnamurthy and Steve G. Steinberg and Katherine
                 Yelick",
  title =        "Empirical evaluation of the {CRAY-T3D}: a compiler
                 perspective",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "23",
  number =       "2",
  pages =        "320--331",
  month =        may,
  year =         "1995",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:47 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Conte:1995:OIF,
  author =       {Thomas M. Conte and Kishore N. Menezes and Patrick M.
                 Mills and Burzin A. Patel},
  title =        {Optimization of instruction fetch mechanisms for high
                 issue rates},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {2},
  pages =        {333--344},
  month =        may,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Uhlig:1995:IFC,
  author =       {Richard Uhlig and David Nagle and Trevor Mudge and
                 Stuart Sechrest and Joel Emer},
  title =        {Instruction fetching: coping with code bloat},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {2},
  pages =        {345--356},
  month =        may,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Lee:1995:ICF,
  author =       {Dennis Lee and Jean-Loup Baer and Brad Calder and Dirk
                 Grunwald},
  title =        {Instruction cache fetch policies for speculative
                 execution},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {2},
  pages =        {357--367},
  month =        may,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Austin:1995:SDC,
  author =       {Todd M. Austin and Dionisios N. Pnevmatikatos and
                 Gurindar S. Sohi},
  title =        {Streamlining data cache access with fast address
                 calculation},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {2},
  pages =        {369--380},
  month =        may,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Wang:1995:CCA,
  author =       {Hong Wang and Tong Sun and Qing Yang},
  title =        {{CAT}---caching address tags: a technique for reducing
                 area cost of on-chip caches},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {2},
  pages =        {381--390},
  month =        may,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Tullsen:1995:SMM,
  author =       {Dean M. Tullsen and Susan J. Eggers and Henry M.
                 Levy},
  title =        {Simultaneous multithreading: maximizing on-chip
                 parallelism},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {2},
  pages =        {392--403},
  month =        may,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Ho:1995:AVP,
  author =       {Richard C. Ho and C. Han Yang and Mark A. Horowitz and
                 David L. Dill},
  title =        {Architecture validation for processors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {2},
  pages =        {404--413},
  month =        may,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Sohi:1995:MP,
  author =       {Gurindar S. Sohi and Scott E. Breach and T. N.
                 Vijaykumar},
  title =        {Multiscalar processors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {2},
  pages =        {414--425},
  month =        may,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Beckmann:1995:HPM,
  author =       {Carl J. Beckmann},
  title =        {{HTGL}: a program modelling language},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {3},
  pages =        {3--10},
  month =        jun,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:57 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Lafitte:1995:SDH,
  author =       {Jean-Louis Lafitte},
  title =        {On structured data handling in parallel processing},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {3},
  pages =        {11--18},
  month =        jun,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:57 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Ulmann:1995:ESB,
  author =       {B. Ulmann},
  title =        {{o$ \mu $-EP-1}: a simple 32-bit architecture},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {3},
  pages =        {19--24},
  month =        jun,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:57 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:1995:INb,
  author =       {Mark Thorson},
  title =        {{Internet} Nuggets},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {3},
  pages =        {25--27},
  month =        jun,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:57 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Tabak:1995:CMH,
  author =       {Daniel Tabak},
  title =        {{{\em Cache and Memory Hierarchy Design: A
                 Performance-Directed Approach\/}} by {Steven A.
                 Przybylski}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {3},
  pages =        {28--28},
  month =        jun,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:57 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Wilkes:1995:MWC,
  author =       {Maurice V. Wilkes},
  title =        {The memory wall and the {CMOS} end-point},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {4},
  pages =        {4--6},
  month =        sep,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:13 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Johnson:1995:GMW,
  author =       {Eric E. Johnson},
  title =        {Graffiti on ``the memory wall''},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {4},
  pages =        {7--8},
  month =        sep,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:13 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Afzal:1995:PMU,
  author =       {Tariq Afzal},
  title =        {Performance modeling using the {Motorola PowerPC}
                 timing simulator},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {4},
  pages =        {9--18},
  month =        sep,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:13 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Parhami:1995:SMD,
  author =       {Behrooz Parhami},
  title =        {{SIMD} machines: do they have a significant future?},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {4},
  pages =        {19--22},
  month =        sep,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:13 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Jain:1995:AAE,
  author =       {Ravi Jain and John Werth},
  title =        {Airdisks and {airRAID} (expanded extract): modeling
                 and scheduling periodic wireless data broadcast},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {4},
  pages =        {23--28},
  month =        sep,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:13 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Kontothanassis:1995:ESM,
  author =       {Leonidas I. Kontothanassis and Michael L. Scott},
  title =        {Efficient shared memory with minimal hardware
                 support},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {4},
  pages =        {29--35},
  month =        sep,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:13 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Gschwind:1995:VP,
  author =       {Michael K. Gschwind and Thomas J. Pietsch},
  title =        {Vector prefetching},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {5},
  pages =        {1--7},
  month =        dec,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:20 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Karne:1995:OOC,
  author =       {Ramesh K. Karne},
  title =        {Object-oriented computer architectures for new
                 generation of applications},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {5},
  pages =        {8--19},
  month =        dec,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:20 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Khalid:1995:URA,
  author =       {Humayun Khalid},
  title =        {The unconventional replacement algorithms},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {5},
  pages =        {20--26},
  month =        dec,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:20 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Khalid:1995:TDS,
  author =       {Humayun Khalid},
  title =        {A trace-driven simulation methodology},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {5},
  pages =        {27--33},
  month =        dec,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:20 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Mirghafori:1995:TSB,
  author =       {Nikki Mirghafori and Margret Jacoby and David
                 Patterson},
  title =        {Truth in {SPEC} benchmarks},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {5},
  pages =        {34--42},
  month =        dec,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:20 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:1995:INc,
  author =       {Mark Thorson},
  title =        {{Internet} Nuggets},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {23},
  number =       {5},
  pages =        {43--44},
  month =        dec,
  year =         {1995},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:20 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Mudge:1996:RPH,
  author =       {Trevor Mudge},
  title =        {Report on the panel: {``How Can Computer Architecture
                 Researchers Avoid Becoming the Society for
                 Irreproducible Results?''}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {24},
  number =       {1},
  pages =        {1--5},
  month =        mar,
  year =         {1996},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:34 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Kwon:1996:COR,
  author =       {Oh-Young Kwon and Gi-Ho Park and Tack-Don Han},
  title =        {A compiler optimization to reduce execution time of
                 loop nest},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {24},
  number =       {1},
  pages =        {6--11},
  month =        mar,
  year =         {1996},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:34 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:1996:INa,
  author =       {Mark Thorson},
  title =        {{Internet} Nuggets},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {24},
  number =       {1},
  pages =        {12--16},
  month =        mar,
  year =         {1996},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:34 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Tabak:1996:BRA,
  author =       {Daniel Tabak},
  title =        {Book Review: {{\em Alpha Implementations and
                 Architecture\/}} by {Dileep P. Bhandarkar}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {24},
  number =       {1},
  pages =        {17--18},
  month =        mar,
  year =         {1996},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:34 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Evers:1996:UHB,
  author =       {Marius Evers and Po-Yung Chang and Yale N. Patt},
  title =        {Using hybrid branch predictors to improve branch
                 prediction accuracy in the presence of context
                 switches},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {24},
  number =       {2},
  pages =        {3--11},
  month =        may,
  year =         {1996},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Gloy:1996:ADB,
  author =       {Nicolas Gloy and Cliff Young and J. Bradley Chen and
                 Michael D. Smith},
  title =        {An analysis of dynamic branch prediction schemes on
                 system workloads},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {24},
  number =       {2},
  pages =        {12--21},
  month =        may,
  year =         {1996},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Sechrest:1996:CAD,
  author =       {Stuart Sechrest and Chih-Chieh Lee and Trevor Mudge},
  title =        {Correlation and aliasing in dynamic branch
                 predictors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {24},
  number =       {2},
  pages =        {22--32},
  month =        may,
  year =         {1996},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Reinhardt:1996:DHS,
  author =       {Steven K. Reinhardt and Robert W. Pfile and David A.
                 Wood},
  title =        {Decoupled hardware support for distributed shared
                 memory},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {24},
  number =       {2},
  pages =        {34--43},
  month =        may,
  year =         {1996},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Yeung:1996:MMS,
  author =       {Donald Yeung and John Kubiatowicz and Anant Agarwal},
  title =        {{MGS}: a multigrain shared memory system},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {24},
  number =       {2},
  pages =        {44--55},
  month =        may,
  year =         {1996},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Morin:1996:COB,
  author =       {Christine Morin and Alain Gefflaut and Michel
                 Ban{\^a}tre and Anne-Marie Kermarrec},
  title =        {{COMA}: an opportunity for building fault-tolerant
                 scalable shared memory multiprocessors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {24},
  number =       {2},
  pages =        {56--65},
  month =        may,
  year =         {1996},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Nayfeh:1996:EDA,
  author =       {Basem A. Nayfeh and Lance Hammond and Kunle Olukotun},
  title =        {Evaluation of design alternatives for a multiprocessor
                 microprocessor},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {24},
  number =       {2},
  pages =        {67--77},
  month =        may,
  year =         {1996},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Burger:1996:MBL,
  author =       {Doug Burger and James R. Goodman and Alain K{\"a}gi},
  title =        {Memory bandwidth limitations of future
                 microprocessors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {24},
  number =       {2},
  pages =        {78--89},
  month =        may,
  year =         {1996},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Saulsbury:1996:MMW,
  author =       {Ashley Saulsbury and Fong Pong and Andreas Nowatzyk},
  title =        {Missing the memory wall: the case for processor\slash
                 memory integration},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {24},
  number =       {2},
  pages =        {90--101},
  month =        may,
  year =         {1996},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Seznec:1996:DUP,
  author =       {Andr{\'e} Seznec},
  title =        {Don't use the page number, but a pointer to it},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {24},
  number =       {2},
  pages =        {104--113},
  month =        may,
  year =         {1996},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Juan:1996:DBC,
  author =       {Toni Juan and Tom{\'a}s Lang and Juan J. Navarro},
  title =        {The difference-bit cache},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {24},
  number =       {2},
  pages =        {114--120},
  month =        may,
  year =         {1996},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Iftode:1996:UAP,
  author =       {Liviu Iftode and Jaswinder Pal Singh and Kai Li},
  title =        {Understanding application performance on shared
                 virtual memory systems},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {24},
  number =       {2},
  pages =        {122--133},
  month =        may,
  year =         {1996},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Holt:1996:AAB,
  author =       {Chris Holt and Jaswinder Pal Singh and John Hennessy},
  title =        {Application and architectural bottlenecks in large
                 scale distributed shared memory machines},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {24},
  number =       {2},
  pages =        {134--145},
  month =        may,
  year =         {1996},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Wilson:1996:ICP,
  author =       {Kenneth M. Wilson and Kunle Olukotun and Mendel
                 Rosenblum},
  title =        {Increasing cache port efficiency for dynamic
                 superscalar microprocessors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {24},
  number =       {2},
  pages =        {147--157},
  month =        may,
  year =         {1996},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Austin:1996:HBA,
  author =       {Todd M. Austin and Gurindar S. Sohi},
  title =        {High-bandwidth address translation for multiple-issue
                 processors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {24},
  number =       {2},
  pages =        {158--167},
  month =        may,
  year =         {1996},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Hu:1996:DDC,
  author =       {Yiming Hu and Qing Yang},
  title =        {{DCD}---disk caching disk: a new approach for boosting
                 {I/O} performance},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {24},
  number =       {2},
  pages =        {169--178},
  month =        may,
  year =         {1996},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Maquelin:1996:PWC,
  author =       {Olivier Maquelin and Guang R. Gao and Herbert H. J.
                 Hum and Kevin B. Theobald and Xin-Min Tian},
  title =        {Polling watchdog: combining polling and interrupts for
                 efficient message handling},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {24},
  number =       {2},
  pages =        {179--188},
  month =        may,
  year =         {1996},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Tullsen:1996:ECI,
  author =       {Dean M. Tullsen and Susan J. Eggers and Joel S. Emer
                 and Henry M. Levy and Jack L. Lo and Rebecca L. Stamm},
  title =        {Exploiting choice: instruction fetch and issue on an
                 implementable simultaneous multithreading processor},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {24},
  number =       {2},
  pages =        {191--202},
  month =        may,
  year =         {1996},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Eickemeyer:1996:EMU,
  author =       {Richard J. Eickemeyer and Ross E. Johnson and Steven
                 R. Kunkel and Mark S. Squillante and Shiafun Liu},
  title =        {Evaluation of multithreaded uniprocessors for
                 commercial application environments},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {24},
  number =       {2},
  pages =        {203--212},
  month =        may,
  year =         {1996},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:47 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Hara:1996:PCI,
  author          = {Hara, Tetsuya and Ando, Hideki and Nakanishi, Chikako
                     and Nakaya, Masao},
  title           = {Performance comparison of {ILP} machines with cycle time
                     evaluation},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {24},
  number          = {2},
  pages           = {213--224},
  month           = may,
  year            = {1996},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:47 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Kim:1996:RCQ,
  author          = {Kim, Jae H. and Chien, Andrew A.},
  title           = {Rotating combined queueing {(RCQ)}: bandwidth and
                     latency guarantees in low-cost, high-performance
                     networks},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {24},
  number          = {2},
  pages           = {226--236},
  month           = may,
  year            = {1996},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:47 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Rexford:1996:RAR,
  author          = {Rexford, Jennifer and Hall, John and Shin, Kang G.},
  title           = {A router architecture for real-time point-to-point
                     networks},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {24},
  number          = {2},
  pages           = {237--246},
  month           = may,
  year            = {1996},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:47 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Mukherjee:1996:CNI,
  author          = {Mukherjee, Shubhendu S. and Falsafi, Babak and
                     Hill, Mark D. and Wood, David A.},
  title           = {Coherent network interfaces for fine-grain
                     communication},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {24},
  number          = {2},
  pages           = {247--258},
  month           = may,
  year            = {1996},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:47 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Horowitz:1996:IMO,
  author          = {Horowitz, Mark and Martonosi, Margaret and
                     Mowry, Todd C. and Smith, Michael D.},
  title           = {Informing memory operations: providing memory
                     performance feedback in modern processors},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {24},
  number          = {2},
  pages           = {260--270},
  month           = may,
  year            = {1996},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:47 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Xia:1996:IPS,
  author          = {Xia, Chun and Torrellas, Josep},
  title           = {Instruction prefetching of systems codes with layout
                     optimized for reduced cache misses},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {24},
  number          = {2},
  pages           = {271--282},
  month           = may,
  year            = {1996},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:47 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Choi:1996:CHS,
  author          = {Choi, Lynn and Yew, Pen-Chung},
  title           = {Compiler and hardware support for cache coherence in
                     large-scale multiprocessors: design considerations
                     and performance study},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {24},
  number          = {2},
  pages           = {283--294},
  month           = may,
  year            = {1996},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:47 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Felten:1996:EEM,
  author          = {Felten, Edward W. and Alpert, Richard D. and
                     Bilas, Angelos and Blumrich, Matthias A. and
                     Clark, Douglas W. and Damianakis, Stefanos N. and
                     Dubnicki, Cezary and Iftode, Liviu and Li, Kai},
  title           = {Early experience with message-passing on the {SHRIMP}
                     multicomputer},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {24},
  number          = {2},
  pages           = {296--307},
  month           = may,
  year            = {1996},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:47 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Lovett:1996:SCN,
  author          = {Lovett, Tom and Clapp, Russell},
  title           = {{STiNG}: a {CC-NUMA} computer system for the commercial
                     marketplace},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {24},
  number          = {2},
  pages           = {308--317},
  month           = may,
  year            = {1996},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:47 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Carretero:1996:MPD,
  author          = {Carretero, J. and P{\'e}rez, F. and de Miguel, P. and
                     Garc{\'\i}a, F. and Alonso, L.},
  title           = {A massively parallel and distributed {I/O} subsystem},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {24},
  number          = {3},
  pages           = {1--8},
  month           = jun,
  year            = {1996},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:58 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Ligon:1996:DLB,
  author          = {{Ligon III}, W. B. and {Stanzione, Jr.}, Daniel C.},
  title           = {Distributing and load-balancing for loops in scientific
                     applications},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {24},
  number          = {3},
  pages           = {9--17},
  month           = jun,
  year            = {1996},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:58 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Belayneh:1996:DNBa,
  author          = {Belayneh, Samson and Kaeli, David R.},
  title           = {A discussion on non-blocking\slash lockup-free caches},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {24},
  number          = {3},
  pages           = {18--25},
  month           = jun,
  year            = {1996},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:58 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Thorson:1996:INb,
  author          = {Thorson, Mark},
  title           = {{Internet} Nuggets},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {24},
  number          = {3},
  pages           = {26--32},
  month           = jun,
  year            = {1996},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:58 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Paez-Monzon:1996:RPD,
  author          = {P{\'a}ez-Monz{\'o}n, Gerard and
                     P{\'a}ez-Monz{\'o}n, Charles},
  title           = {The {RISC} processor {DMN-6}: a unified data-control
                     flow architecture},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {24},
  number          = {4},
  pages           = {3--10},
  month           = sep,
  year            = {1996},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:13 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

%%% Author names are given in ``Last, First'' form so that BibTeX keeps
%%% each two-word surname together instead of treating its first word as
%%% a given name.  The citation label is left unchanged so that existing
%%% citations continue to resolve.
@Article{Pulido:1996:ETT,
  author =       "G{\'o}mez Pulido, J. A. and S{\'a}nchez P{\'e}rez, J. M.
                 and Moreno Zamora, J. A.",
  title =        "An educational tool for testing hierarchical
                 multilevel caches",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "24",
  number =       "4",
  pages =        "11--15",
  month =        sep,
  year =         "1996",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:13 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@article{Belayneh:1996:DNBb,
  author          = {Belayneh, Samson and Kaeli, David R.},
  title           = {A discussion on non-blocking\slash lockup-free caches},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {24},
  number          = {4},
  pages           = {16--16},
  month           = sep,
  year            = {1996},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:13 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Rosenbaum:1996:AP,
  author          = {Rosenbaum, Mark},
  title           = {Architectural potholes},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {24},
  number          = {4},
  pages           = {17--18},
  month           = sep,
  year            = {1996},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:13 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Mashey:1996:AP,
  author          = {Mashey, John},
  title           = {Architectural potholes},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {24},
  number          = {4},
  pages           = {18--18},
  month           = sep,
  year            = {1996},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:13 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Cockcroft:1996:P,
  author          = {Cockcroft, Adrian},
  title           = {{I/O} potholes},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {24},
  number          = {4},
  pages           = {18--19},
  month           = sep,
  year            = {1996},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:13 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Ebrahim:1996:P,
  author          = {Ebrahim, Zahir},
  title           = {{I/O} potholes},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {24},
  number          = {4},
  pages           = {19--20},
  month           = sep,
  year            = {1996},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:13 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Carlile:1996:IB,
  author          = {Carlile, Brad},
  title           = {Interpreting benchmarks},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {24},
  number          = {4},
  pages           = {20--21},
  month           = sep,
  year            = {1996},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:13 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Chase:1996:RW,
  author          = {Chase, David},
  title           = {Register windows},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {24},
  number          = {4},
  pages           = {21--21},
  month           = sep,
  year            = {1996},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:13 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{DeMone:1996:RWD,
  author          = {DeMone, Paul W.},
  title           = {Register windows and delay slots},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {24},
  number          = {4},
  pages           = {21--22},
  month           = sep,
  year            = {1996},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:13 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Rose:1996:CIT,
  author          = {Rose, Charlton D. and Flanagan, J. Kelly},
  title           = {Constructing instruction traces from cache-filtered
                     address traces {(CITCAT)}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {24},
  number          = {5},
  pages           = {1--8},
  month           = dec,
  year            = {1996},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:20 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Hummel:1996:EDS,
  author          = {Hummel, Susan Flynn},
  title           = {Efficient data sharing with conditional remote memory
                     transfers},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {24},
  number          = {5},
  pages           = {9--17},
  month           = dec,
  year            = {1996},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:20 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Widigen:1996:EOR,
  author          = {Widigen, Larry and Sowadsky, Elliot and McGrath, Kevin},
  title           = {Eliminating operand read latency},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {24},
  number          = {5},
  pages           = {18--22},
  month           = dec,
  year            = {1996},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:20 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Machanick:1996:CSM,
  author          = {Machanick, Philip},
  title           = {The case for {SRAM} main memory},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {24},
  number          = {5},
  pages           = {23--30},
  month           = dec,
  year            = {1996},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:20 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Bhandarkar:1997:RVC,
  author          = {Bhandarkar, Dileep},
  title           = {{RISC} versus {CISC}: a tale of two chips},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {25},
  number          = {1},
  pages           = {1--12},
  month           = mar,
  year            = {1997},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:35 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Martin:1997:SCM,
  author          = {Mart{\'\i}n, I. and Tirado, F.},
  title           = {A {SIMD} computer for multigrid methods},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {25},
  number          = {1},
  pages           = {13--18},
  month           = mar,
  year            = {1997},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:35 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Weicker:1997:USB,
  author          = {Weicker, Reinhold},
  title           = {On the use of {SPEC} benchmarks in computer architecture
                     research},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {25},
  number          = {1},
  pages           = {19--22},
  month           = mar,
  year            = {1997},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:35 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Mukherjee:1997:WSG,
  author          = {Mukherjee, Shubhendu S.},
  title           = {What should graduate students know before joining a
                     large computer architecture project?},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {25},
  number          = {1},
  pages           = {23--26},
  month           = mar,
  year            = {1997},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:35 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Khalid:1997:NCR,
  author          = {Khalid, Humayun},
  title           = {A new cache replacement scheme based on backpropagation
                     neural networks},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {25},
  number          = {1},
  pages           = {27--33},
  month           = mar,
  year            = {1997},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:35 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Thorson:1997:INa,
  author          = {Thorson, Mark},
  title           = {{Internet} Nuggets},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {25},
  number          = {1},
  pages           = {34--36},
  month           = mar,
  year            = {1997},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:35 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Vajapeyam:1997:ISI,
  author          = {Vajapeyam, Sriram and Mitra, Tulika},
  title           = {Improving superscalar instruction dispatch and issue by
                     exploiting dynamic code sequences},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {25},
  number          = {2},
  pages           = {1--12},
  month           = may,
  year            = {1997},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:48 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Nair:1997:EIL,
  author          = {Nair, Ravi and Hopkins, Martin E.},
  title           = {Exploiting instruction level parallelism in processors
                     by caching scheduled groups},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {25},
  number          = {2},
  pages           = {13--25},
  month           = may,
  year            = {1997},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:48 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Ebcioglu:1997:DDC,
  author          = {Ebcio{\u{g}}lu, Kemal and Altman, Erik R.},
  title           = {{DAISY}: dynamic compilation for 100\% architectural
                     compatibility},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {25},
  number          = {2},
  pages           = {26--37},
  month           = may,
  year            = {1997},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:48 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Pinkston:1997:DIN,
  author          = {Pinkston, Timothy Mark and Warnakulasuriya, Sugath},
  title           = {On deadlocks in interconnection networks},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {25},
  number          = {2},
  pages           = {38--49},
  month           = may,
  year            = {1997},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:48 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Stunkel:1997:IMW,
  author          = {Stunkel, Craig B. and Sivaram, Rajeev and
                     Panda, Dhabaleswar K.},
  title           = {Implementing multidestination worms in switch-based
                     parallel systems: architectural alternatives and their
                     impact},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {25},
  number          = {2},
  pages           = {50--61},
  month           = may,
  year            = {1997},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:48 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Alvarez:1997:TMF,
  author          = {Alvarez, Guillermo A. and Burkhard, Walter A. and
                     Cristian, Flaviu},
  title           = {Tolerating multiple failures in {RAID} architectures
                     with optimal storage and uniform declustering},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {25},
  number          = {2},
  pages           = {62--72},
  month           = may,
  year            = {1997},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:48 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Teodosiu:1997:HFC,
  author          = {Teodosiu, Dan and Baxter, Joel and Govil, Kinshuk and
                     Chapin, John and Rosenblum, Mendel and Horowitz, Mark},
  title           = {Hardware fault containment in scalable shared-memory
                     multiprocessors},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {25},
  number          = {2},
  pages           = {73--84},
  month           = may,
  year            = {1997},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:48 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Martin:1997:ECL,
  author          = {Martin, Richard P. and Vahdat, Amin M. and
                     Culler, David E. and Anderson, Thomas E.},
  title           = {Effects of communication latency, overhead, and
                     bandwidth in a cluster architecture},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {25},
  number          = {2},
  pages           = {85--97},
  month           = may,
  year            = {1997},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:48 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Weber:1997:MIA,
  author          = {Weber, Wolf-Dietrich and Gold, Stephen and Helland, Pat
                     and Shimizu, Takeshi and Wicki, Thomas and
                     Wilcke, Winfried},
  title           = {The {Mercury Interconnect Architecture}: a
                     cost-effective infrastructure for high-performance
                     servers},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {25},
  number          = {2},
  pages           = {98--107},
  month           = may,
  year            = {1997},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:48 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Hakura:1997:DAC,
  author          = {Hakura, Ziyad S. and Gupta, Anoop},
  title           = {The design and analysis of a cache architecture for
                     texture mapping},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {25},
  number          = {2},
  pages           = {108--120},
  month           = may,
  year            = {1997},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:48 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Wilson:1997:DHB,
  author          = {Wilson, Kenneth M. and Olukotun, Kunle},
  title           = {Designing high bandwidth on-chip caches},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {25},
  number          = {2},
  pages           = {121--132},
  month           = may,
  year            = {1997},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:48 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Farkas:1997:MSD,
  author          = {Farkas, Keith I. and Chow, Paul and Jouppi, Norman P.
                     and Vranesic, Zvonko},
  title           = {Memory-system design considerations for
                     dynamically-scheduled processors},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {25},
  number          = {2},
  pages           = {133--143},
  month           = may,
  year            = {1997},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:48 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Ranganathan:1997:ISP,
  author = {Parthasarathy Ranganathan and Vijay S. Pai and
    Hazim Abdel-Shafi and Sarita V. Adve},
  title = {The interaction of software prefetching with {ILP} processors
    in shared-memory systems},
  journal = j-COMP-ARCH-NEWS,
  volume = {25},
  number = {2},
  pages = {144--156},
  month = may,
  year = {1997},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:40:48 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Kontothanassis:1997:VBS,
  author = {Leonidas Kontothanassis and Galen Hunt and Robert Stets and
    Nikolaos Hardavellas and Micha{\l} Cierniak and
    Srinivasan Parthasarathy and Wagner {Meira, Jr.} and
    Sandhya Dwarkadas and Michael Scott},
  title = {{VM}-based shared memory on low-latency, remote-memory-access networks},
  journal = j-COMP-ARCH-NEWS,
  volume = {25},
  number = {2},
  pages = {157--169},
  month = may,
  year = {1997},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:40:48 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Kagi:1997:ESL,
  author = {Alain K{\"a}gi and Doug Burger and James R. Goodman},
  title = {Efficient synchronization: let them eat {QOLB}},
  journal = j-COMP-ARCH-NEWS,
  volume = {25},
  number = {2},
  pages = {170--180},
  month = may,
  year = {1997},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:40:48 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Moshovos:1997:DSS,
  author = {Andreas Moshovos and Scott E. Breach and T. N. Vijaykumar and
    Gurindar S. Sohi},
  title = {Dynamic speculation and synchronization of data dependences},
  journal = j-COMP-ARCH-NEWS,
  volume = {25},
  number = {2},
  pages = {181--193},
  month = may,
  year = {1997},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:40:48 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Sodani:1997:DIR,
  author = {Avinash Sodani and Gurindar S. Sohi},
  title = {Dynamic instruction reuse},
  journal = j-COMP-ARCH-NEWS,
  volume = {25},
  number = {2},
  pages = {194--205},
  month = may,
  year = {1997},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:40:48 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Palacharla:1997:CES,
  author = {Subbarao Palacharla and Norman P. Jouppi and J. E. Smith},
  title = {Complexity-effective superscalar processors},
  journal = j-COMP-ARCH-NEWS,
  volume = {25},
  number = {2},
  pages = {206--218},
  month = may,
  year = {1997},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:40:48 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Michael:1997:CCA,
  author = {Maged M. Michael and Ashwini K. Nanda and Beng-Hong Lim and
    Michael L. Scott},
  title = {Coherence controller architectures for {SMP}-based {CC-NUMA} multiprocessors},
  journal = j-COMP-ARCH-NEWS,
  volume = {25},
  number = {2},
  pages = {219--228},
  month = may,
  year = {1997},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:40:48 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Falsafi:1997:RND,
  author = {Babak Falsafi and David A. Wood},
  title = {Reactive {NUMA}: a design for unifying {S-COMA} and {CC-NUMA}},
  journal = j-COMP-ARCH-NEWS,
  volume = {25},
  number = {2},
  pages = {229--240},
  month = may,
  year = {1997},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:40:48 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Laudon:1997:SOC,
  author = {James Laudon and Daniel Lenoski},
  title = {The {SGI Origin}: a {ccNUMA} highly scalable server},
  journal = j-COMP-ARCH-NEWS,
  volume = {25},
  number = {2},
  pages = {241--251},
  month = may,
  year = {1997},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:40:48 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Joseph:1997:PUM,
  author = {Doug Joseph and Dirk Grunwald},
  title = {Prefetching using {Markov} predictors},
  journal = j-COMP-ARCH-NEWS,
  volume = {25},
  number = {2},
  pages = {252--263},
  month = may,
  year = {1997},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:40:48 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Santhanam:1997:DPH,
  author = {Vatsa Santhanam and Edward H. Gornish and Wei-Chung Hsu},
  title = {Data prefetching on the {HP PA-8000}},
  journal = j-COMP-ARCH-NEWS,
  volume = {25},
  number = {2},
  pages = {264--273},
  month = may,
  year = {1997},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:40:48 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Chang:1997:TPI,
  author = {Po-Yung Chang and Eric Hao and Yale N. Patt},
  title = {Target prediction for indirect jumps},
  journal = j-COMP-ARCH-NEWS,
  volume = {25},
  number = {2},
  pages = {274--283},
  month = may,
  year = {1997},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:40:48 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Sprangle:1997:APM,
  author = {Eric Sprangle and Robert S. Chappell and Mitch Alsup and Yale N. Patt},
  title = {The agree predictor: a mechanism for reducing negative branch
    history interference},
  journal = j-COMP-ARCH-NEWS,
  volume = {25},
  number = {2},
  pages = {284--291},
  month = may,
  year = {1997},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:40:48 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Michaud:1997:TCC,
  author = {Pierre Michaud and Andr{\'e} Seznec and Richard Uhlig},
  title = {Trading conflict and capacity aliasing in conditional branch predictors},
  journal = j-COMP-ARCH-NEWS,
  volume = {25},
  number = {2},
  pages = {292--303},
  month = may,
  year = {1997},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:40:48 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Emer:1997:LDP,
  author = {Joel Emer and Nikolas Gloy},
  title = {A language for describing predictors and its application to
    automatic synthesis},
  journal = j-COMP-ARCH-NEWS,
  volume = {25},
  number = {2},
  pages = {304--314},
  month = may,
  year = {1997},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:40:48 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Johnson:1997:RTA,
  author = {Teresa L. Johnson and Wen-mei W. Hwu},
  title = {Run-time adaptive cache hierarchy management via reference analysis},
  journal = j-COMP-ARCH-NEWS,
  volume = {25},
  number = {2},
  pages = {315--326},
  month = may,
  year = {1997},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:40:48 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Fromm:1997:EEI,
  author = {Richard Fromm and Stylianos Perissakis and Neal Cardwell and
    Christoforos Kozyrakis and Bruce McGaughy and David Patterson and
    Tom Anderson and Katherine Yelick},
  title = {The energy efficiency of {IRAM} architectures},
  journal = j-COMP-ARCH-NEWS,
  volume = {25},
  number = {2},
  pages = {327--337},
  month = may,
  year = {1997},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:40:48 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Burger:1997:DA,
  author = {Doug Burger and Stefanos Kaxiras and James R. Goodman},
  title = {{DataScalar} architectures},
  journal = j-COMP-ARCH-NEWS,
  volume = {25},
  number = {2},
  pages = {338--349},
  month = may,
  year = {1997},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:40:48 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Wilkes:1997:CLS,
  author = {Maurice Wilkes and Andrew Hopper},
  title = {The collapsed {LAN}: a solution to a bandwidth problem?},
  journal = j-COMP-ARCH-NEWS,
  volume = {25},
  number = {3},
  pages = {1--5},
  month = jun,
  year = {1997},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:40:58 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Jokinen:1997:CDP,
  author = {Tommi Jokinen and Chia-Jiu Wang},
  title = {Cache design with path balancing table, skewing and indirect tags},
  journal = j-COMP-ARCH-NEWS,
  volume = {25},
  number = {3},
  pages = {6--12},
  month = jun,
  year = {1997},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:40:58 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Burger:1997:STS,
  author = {Doug Burger and Todd M. Austin},
  title = {The {SimpleScalar} tool set, version 2.0},
  journal = j-COMP-ARCH-NEWS,
  volume = {25},
  number = {3},
  pages = {13--25},
  month = jun,
  year = {1997},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:40:58 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:1997:INb,
  author = {Mark Thorson},
  title = {{Internet} Nuggets},
  journal = j-COMP-ARCH-NEWS,
  volume = {25},
  number = {3},
  pages = {26--27},
  month = jun,
  year = {1997},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:40:58 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{VanMeter:1997:RCL,
  author =       "Rodney {Van Meter} and Greg Finn and Steve Hotz and
                 Dave Dyer",
  title =        "Response to the collapsed {LAN}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "25",
  number =       "4",
  pages =        "1--2",
  month =        sep,
  year =         "1997",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:14 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hu:1997:OES,
  author = {Weiwu Hu and Peisu Xia},
  title = {Out-of-order execution in sequentially consistent shared-memory systems},
  journal = j-COMP-ARCH-NEWS,
  volume = {25},
  number = {4},
  pages = {3--10},
  month = sep,
  year = {1997},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:41:14 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Khalid:1997:NTS,
  author = {Humayun Khalid},
  title = {A novel trace sampling technique},
  journal = j-COMP-ARCH-NEWS,
  volume = {25},
  number = {4},
  pages = {11--16},
  month = sep,
  year = {1997},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:41:14 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Khalid:1997:PKC,
  author = {Humayun Khalid},
  title = {Performance of the {KORA-2} cache replacement scheme},
  journal = j-COMP-ARCH-NEWS,
  volume = {25},
  number = {4},
  pages = {17--21},
  month = sep,
  year = {1997},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:41:14 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Jutla:1997:IAP,
  author = {D. N. Jutla and P. Bodorik},
  title = {Improving applications performance: a memory model and cache architecture},
  journal = j-COMP-ARCH-NEWS,
  volume = {25},
  number = {4},
  pages = {22--29},
  month = sep,
  year = {1997},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:41:14 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Ulmann:1997:NEP,
  author = {B. Ulmann},
  title = {{NICE}: an elegant and powerful 32-bit architecture},
  journal = j-COMP-ARCH-NEWS,
  volume = {25},
  number = {4},
  pages = {30--35},
  month = sep,
  year = {1997},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:41:14 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:1997:INc,
  author = {Mark Thorson},
  title = {{Internet} Nuggets},
  journal = j-COMP-ARCH-NEWS,
  volume = {25},
  number = {4},
  pages = {36--41},
  month = sep,
  year = {1997},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:41:14 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Pai:1997:RRS,
  author = {Vijay S. Pai and Parthasarathy Ranganathan and Sarita V. Adve},
  title = {{RSIM}: {Rice} simulator for {ILP} multiprocessors},
  journal = j-COMP-ARCH-NEWS,
  volume = {25},
  number = {5},
  pages = {1--1},
  month = dec,
  year = {1997},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:41:21 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Shi:1997:IID,
  author = {Weisong Shi and Weiwu Hu and Ming Zhu},
  title = {An innovative implementation for directory-based cache coherence
    in shared memory multiprocessors},
  journal = j-COMP-ARCH-NEWS,
  volume = {25},
  number = {5},
  pages = {2--9},
  month = dec,
  year = {1997},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:41:21 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:1997:INd,
  author = {Mark Thorson},
  title = {{Internet} Nuggets},
  journal = j-COMP-ARCH-NEWS,
  volume = {25},
  number = {5},
  pages = {10--14},
  month = dec,
  year = {1997},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:41:21 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Ulmann:1998:ILE,
  author = {B. Ulmann},
  title = {Instruction looping, an extension to conditional execution},
  journal = j-COMP-ARCH-NEWS,
  volume = {26},
  number = {1},
  pages = {3--4},
  month = mar,
  year = {1998},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/1216461.1216462},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Tue Jun 17 12:06:32 MDT 2008},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {The following article describes an easy to implement but very
    powerful extension to simple conditional execution based program flow
    control as used for example in the ARM RISC processors and others.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Haring:1998:IWP,
  author =       "G{\"u}nter Haring and Christoph Lindemann and Martin
                 Reiser",
  title =        "International workshop performance evaluation ---
                 origins and directions",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "26",
  number =       "1",
  pages =        "5--6",
  month =        mar,
  year =         "1998",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1216461.1216463",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 12:06:32 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Performance Evaluation is a discipline of Computer
                 Science for some thirty years. It seems time to take
                 stock of what we were doing. That is, provide answers
                 to the following questions: $ \bullet $ What are its
                 scientific contributions? $ \bullet $ What is its
                 relevance in industry and business? $ \bullet $ What
                 is its standing in academia? $ \bullet $ Where is the
                 field headed? $ \bullet $ What are its success stories
                 and failures? $ \bullet $ What are its current burning
                 questions?",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Munsil:1998:RSU,
  author = {Wes Munsil and Chia-Jiu Wang},
  title = {Reducing stack usage in {Java} bytecode execution},
  journal = j-COMP-ARCH-NEWS,
  volume = {26},
  number = {1},
  pages = {7--11},
  month = mar,
  year = {1998},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/1216461.1216464},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Tue Jun 17 12:06:32 MDT 2008},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {For many years, the Tomasulo method of dynamically scheduling
    instructions for execution in a load/store processor has been known and
    used. This paper presents an adaptation of the Tomasulo method to a
    stack-based processor architecture, and illustrates its use in a
    software simulator of a subset of the Java Virtual Machine.
    Experimental results show that the adapted Tomasulo method reduces
    stack usage, in some cases eliminating it altogether. This method
    should be of interest to computer architects and those involved in the
    implementation and use of the Java programming language.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:1998:INaa,
  author =       "Mark Thorson",
  title =        "{Internet} Nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "26",
  number =       "1",
  pages =        "12--17",
  month =        mar,
  year =         "1998",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1216461.1216465",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 12:06:32 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This column consists of selected traffic from the
                 comp.arch newsgroup, a forum for discussion of computer
                 architecture on Internet --- an international computer
                 network. As always, the opinions expressed in this
                 column are the personal views of the authors, and do
                 not necessarily represent the institutions to which
                 they are affiliated. Text which sets the context of a
                 message appears in italics; this is usually text the
                 author has quoted from earlier messages. The code-like
                 expressions below the authors' names are their
                 addresses on Internet.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Moudgill:1998:TFS,
  author = {Mayan Moudgill},
  title = {Techniques for fast simulation of associative cache directories},
  journal = j-COMP-ARCH-NEWS,
  volume = {26},
  number = {2},
  pages = {1--8},
  month = may,
  year = {1998},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:40:48 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Chung:1998:LBC,
  author = {Byung-Kwon Chung and Jih-Kwon Peir},
  title = {{LRU}-based column-associative caches},
  journal = j-COMP-ARCH-NEWS,
  volume = {26},
  number = {2},
  pages = {9--17},
  month = may,
  year = {1998},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:40:48 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:1998:INb,
  author = {Mark Thorson},
  title = {{Internet} Nuggets},
  journal = j-COMP-ARCH-NEWS,
  volume = {26},
  number = {2},
  pages = {18--22},
  month = may,
  year = {1998},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:40:48 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Barroso:1998:MSC,
  author = {Luiz Andr{\'e} Barroso and Kourosh Gharachorloo and Edouard Bugnion},
  title = {Memory system characterization of commercial workloads},
  journal = j-COMP-ARCH-NEWS,
  volume = {26},
  number = {3},
  pages = {3--14},
  month = jun,
  year = {1998},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:40:58 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Keeton:1998:PCQ,
  author = {Kimberly Keeton and David A. Patterson and Yong Qiang He and
    Roger C. Raphael and Walter E. Baker},
  title = {Performance characterization of a {Quad Pentium Pro SMP} using
    {OLTP} workloads},
  journal = j-COMP-ARCH-NEWS,
  volume = {26},
  number = {3},
  pages = {15--26},
  month = jun,
  year = {1998},
  CODEN = {CANED2},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Fri May 12 09:40:58 MDT 2006},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Lee:1998:ECD,
  author =       {Lee, Dennis C. and Crowley, Patrick J. and
                 Baer, Jean-Loup and Anderson, Thomas E. and
                 Bershad, Brian N.},
  title =        {Execution characteristics of desktop applications
                 on {Windows NT}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {3},
  pages =        {27--38},
  month =        jun,
  year =         {1998},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:58 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Lo:1998:ADW,
  author =       {Lo, Jack L. and Barroso, Luiz Andr{\'e} and
                 Eggers, Susan J. and Gharachorloo, Kourosh and
                 Levy, Henry M. and Parekh, Sujay S.},
  title =        {An analysis of database workload performance
                 on simultaneous multithreaded processors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {3},
  pages =        {39--50},
  month =        jun,
  year =         {1998},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:58 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Evers:1998:ACP,
  author =       {Evers, Marius and Patel, Sanjay J. and
                 Chappell, Robert S. and Patt, Yale N.},
  title =        {An analysis of correlation and predictability:
                 what makes two-level branch predictors work},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {3},
  pages =        {52--61},
  month =        jun,
  year =         {1998},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:58 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

%%% NOTE(review): the third author's given name is entered as ``Shlomo''
%%% (the source metadata had ``Sholomo'', which looks like a typo);
%%% TODO confirm against the printed paper.
@Article{Federovsky:1998:BPB,
  author =       "Eitan Federovsky and Meir Feder and Shlomo Weiss",
  title =        "Branch prediction based on universal data compression
                 algorithms",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "26",
  number =       "3",
  pages =        "62--72",
  month =        jun,
  year =         "1998",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:58 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sazeides:1998:MPP,
  author =       {Sazeides, Yiannakis and Smith, James E.},
  title =        {Modeling program predictability},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {3},
  pages =        {73--84},
  month =        jun,
  year =         {1998},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:58 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Cox:1998:MLT,
  author =       {Cox, Michael and Bhandari, Narendra and
                 Shantz, Michael},
  title =        {Multi-level texture caching for {$3$D}
                 graphics hardware},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {3},
  pages =        {86--97},
  month =        jun,
  year =         {1998},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:58 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Eberle:1998:SQC,
  author =       {Eberle, Hans and Oertli, Erwin},
  title =        {{Switcherland}: a {QoS} communication architecture
                 for workstation clusters},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {3},
  pages =        {98--108},
  month =        jun,
  year =         {1998},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:58 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Alvarez:1998:DDA,
  author =       {Alvarez, Guillermo A. and Burkhard, Walter A. and
                 Stockmeyer, Larry J. and Cristian, Flaviu},
  title =        {Declustered disk array architectures with optimal
                 and near-optimal parallelism},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {3},
  pages =        {109--120},
  month =        jun,
  year =         {1998},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:58 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Grunwald:1998:CES,
  author =       {Grunwald, Dirk and Klauser, Artur and
                 Manne, Srilatha and Pleszkun, Andrew},
  title =        {Confidence estimation for speculation control},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {3},
  pages =        {122--131},
  month =        jun,
  year =         {1998},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:58 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Manne:1998:PGS,
  author =       {Manne, Srilatha and Klauser, Artur and
                 Grunwald, Dirk},
  title =        {Pipeline gating: speculation control for
                 energy reduction},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {3},
  pages =        {132--141},
  month =        jun,
  year =         {1998},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:58 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Chrysos:1998:MDP,
  author =       {Chrysos, George Z. and Emer, Joel S.},
  title =        {Memory dependence prediction using store sets},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {3},
  pages =        {142--153},
  month =        jun,
  year =         {1998},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:58 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Juan:1998:DHL,
  author =       {Juan, Toni and Sanjeevan, Sanji and
                 Navarro, Juan J.},
  title =        {Dynamic history-length fitting: a third level
                 of adaptivity for branch prediction},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {3},
  pages =        {155--166},
  month =        jun,
  year =         {1998},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:58 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Driesen:1998:AIB,
  author =       {Driesen, Karel and H{\"o}lzle, Urs},
  title =        {Accurate indirect branch prediction},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {3},
  pages =        {167--178},
  month =        jun,
  year =         {1998},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:58 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Mukherjee:1998:UPA,
  author =       {Mukherjee, Shubhendu S. and Hill, Mark D.},
  title =        {Using prediction to accelerate coherence protocols},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {3},
  pages =        {179--190},
  month =        jun,
  year =         {1998},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:58 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Oskin:1998:APC,
  author =       {Oskin, Mark and Chong, Frederic T. and
                 Sherwood, Timothy},
  title =        {Active pages: a computation model for
                 intelligent memory},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {3},
  pages =        {192--203},
  month =        jun,
  year =         {1998},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:58 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Swanson:1998:ITR,
  author =       {Swanson, Mark and Stoller, Leigh and Carter, John},
  title =        {Increasing {TLB} reach using superpages backed
                 by shadow memory},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {3},
  pages =        {204--213},
  month =        jun,
  year =         {1998},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:58 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Qiu:1998:ODA,
  author =       {Qiu, Xiaogang and Dubois, Michel},
  title =        {Options for dynamic address translation in {COMAs}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {3},
  pages =        {214--225},
  month =        jun,
  year =         {1998},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:58 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

%%% Authors are entered in ``Last, First'' form.  In ``First Last''
%%% order BibTeX splits name tokens at hyphens and treats a token that
%%% starts with a lowercase letter (here ``mei'' in ``Wen-mei'') as the
%%% start of a von part, which would yield the surname ``W. Hwu''.
@Article{August:1998:IPS,
  author =       "August, David I. and Connors, Daniel A. and
                 Mahlke, Scott A. and Sias, John W. and
                 Crozier, Kevin M. and Cheng, Ben-Chung and
                 Eaton, Patrick R. and Olaniran, Qudus B. and
                 Hwu, Wen-mei W.",
  title =        "Integrated predicated and speculative execution in the
                 {IMPACT EPIC} architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "26",
  number =       "3",
  pages =        "227--237",
  month =        jun,
  year =         "1998",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:58 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wallace:1998:TMP,
  author =       {Wallace, Steven and Calder, Brad and
                 Tullsen, Dean M.},
  title =        {Threaded multiple path execution},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {3},
  pages =        {238--249},
  month =        jun,
  year =         {1998},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:58 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Klauser:1998:SEE,
  author =       {Klauser, Artur and Paithankar, Abhijit and
                 Grunwald, Dirk},
  title =        {Selective eager execution on the
                 {PolyPath} architecture},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {3},
  pages =        {250--259},
  month =        jun,
  year =         {1998},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:58 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Patel:1998:ITC,
  author =       {Patel, Sanjay Jeram and Evers, Marius and
                 Patt, Yale N.},
  title =        {Improving trace cache effectiveness with
                 branch promotion and trace packing},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {3},
  pages =        {262--271},
  month =        jun,
  year =         {1998},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:58 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Gabbay:1998:EIF,
  author =       {Gabbay, Freddy and Mendelson, Avi},
  title =        {The effect of instruction fetch bandwidth on
                 value prediction},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {3},
  pages =        {272--281},
  month =        jun,
  year =         {1998},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:58 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Albonesi:1998:DIC,
  author =       {Albonesi, David H.},
  title =        {Dynamic {IPC\slash clock} rate optimization},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {3},
  pages =        {282--292},
  month =        jun,
  year =         {1998},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:58 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Zhang:1998:PMC,
  author =       {Zhang, Yinong and {Adams III}, George B.},
  title =        {Performance modeling and code partitioning for
                 the {DS} architecture},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {3},
  pages =        {293--304},
  month =        jun,
  year =         {1998},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:58 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Keckler:1998:EFG,
  author =       {Keckler, Stephen W. and Dally, William J. and
                 Maskit, Daniel and Carter, Nicholas P. and
                 Chang, Andrew and Lee, Whay S.},
  title =        {Exploiting fine-grain thread level parallelism on
                 the {MIT} multi-{ALU} processor},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {3},
  pages =        {306--317},
  month =        jun,
  year =         {1998},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:58 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Abandah:1998:EAT,
  author =       {Abandah, Gheith A. and Davidson, Edward S.},
  title =        {Effects of architectural and technological advances
                 on the {HP\slash Convex Exemplar}'s memory and
                 communication performance},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {3},
  pages =        {318--329},
  month =        jun,
  year =         {1998},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:58 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Blumrich:1998:DCS,
  author =       {Blumrich, Matthias A. and Alpert, Richard D. and
                 Chen, Yuqun and Clark, Douglas W. and
                 Damianakis, Stefanos N. and Dubnicki, Cezary and
                 Felten, Edward W. and Iftode, Liviu and Li, Kai and
                 Martonosi, Margaret and Shillner, Robert A.},
  title =        {Design choices in the {SHRIMP} system: an
                 empirical study},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {3},
  pages =        {330--341},
  month =        jun,
  year =         {1998},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:58 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Soundararajan:1998:FUM,
  author =       {Soundararajan, Vijayaraghavan and Heinrich, Mark and
                 Verghese, Ben and Gharachorloo, Kourosh and
                 Gupta, Anoop and Hennessy, John},
  title =        {Flexible use of memory for
                 replication\slash migration in cache-coherent
                 {DSM} multiprocessors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {3},
  pages =        {342--355},
  month =        jun,
  year =         {1998},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:58 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Kumar:1998:ESL,
  author =       {Kumar, Sanjeev and Wilkerson, Christopher},
  title =        {Exploiting spatial locality in data caches
                 using spatial footprints},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {3},
  pages =        {357--368},
  month =        jun,
  year =         {1998},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:58 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Lynch:1998:LLL,
  author =       {Lynch, William L. and Lauterbach, Gary and
                 Chamdani, Joseph I.},
  title =        {Low load latency through sum-addressed
                 memory {(SAM)}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {3},
  pages =        {369--379},
  month =        jun,
  year =         {1998},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:58 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Sorin:1998:AES,
  author =       {Sorin, Daniel J. and Pai, Vijay S. and
                 Adve, Sarita V. and Vernon, Mary K. and
                 Wood, David A.},
  title =        {Analytic evaluation of shared-memory systems
                 with {ILP} processors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {3},
  pages =        {380--391},
  month =        jun,
  year =         {1998},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:58 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Golla:1998:CEB,
  author =       {Golla, Prasad N. and Lin, Eric C.},
  title =        {A comparison of the effect of branch prediction
                 on multithreaded and scalar architectures},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {4},
  pages =        {3--11},
  month =        sep,
  year =         {1998},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/1216475.1216476},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Tue Jun 17 12:06:40 MDT 2008},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract =     {Speculative instructions execution requires dynamic
                 branch predictors to increase the performance of a
                 processor by executing from predicted branch target
                 routines. Conventional Scalar architectures such as the
                 Superscalar or Multiscalar architecture executes from a
                 single stream, while a Multithreaded architecture
                 executes from multiple streams at a time. Several
                 aggressive branch predictors have been proposed with
                 high prediction accuracies. Unfortunately, none of the
                 branch predictors can provide 100\% accuracy.
                 Therefore, there is an inherent limitation on
                 speculative execution in real implementation. In this
                 paper, we show that Multithreaded architecture is a
                 better candidate for utilizing speculative execution
                 than Scalar architectures. Generally the branch
                 prediction performance degradation is compounded for
                 larger window sizes on Scalar architectures, while for
                 a Multithreaded architecture, by increasing the number
                 of executing threads, we could sustain a higher
                 performance for a large aggregated speculative window
                 size. Hence, heavier workloads may increase performance
                 and utilization for Multithreaded architectures. We
                 present analytical and simulation results to support
                 our argument.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:1998:INc,
  author =       {Thorson, Mark},
  title =        {{Internet} nuggets},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {4},
  pages =        {12--16},
  month =        sep,
  year =         {1998},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/1216475.1216477},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Tue Jun 17 12:06:40 MDT 2008},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract =     {This column consists of selected traffic from the
                 comp.arch newsgroup, a forum for discussion of computer
                 architecture on Internet---an international computer
                 network. As always, the opinions expressed in this
                 column are the personal views of the authors, and do
                 not necessarily represent the institutions to which
                 they are affiliated. Text which sets the context of a
                 message appears in italics; this is usually text the
                 author has quoted from earlier messages. The code-like
                 expressions below the authors' names are their
                 addresses on Internet.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Machanick:1998:SVL,
  author =       {Machanick, Philip},
  title =        {Streaming vs. latency in information mass-transit},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {5},
  pages =        {4--6},
  month =        dec,
  year =         {1998},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:21 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Lafitte:1998:GMD,
  author =       {Lafitte, Jean-Louis},
  title =        {A generalized mapping device to help memory latency},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {5},
  pages =        {7--13},
  month =        dec,
  year =         {1998},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:21 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Ashraf:1998:IRM,
  author =       {Ashraf, Farooq and Abd-El-Barr, Mostafa and
                 Al-Tawil, Khalid},
  title =        {Introduction to routing in multicomputer networks},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {5},
  pages =        {14--21},
  month =        dec,
  year =         {1998},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:21 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Wilmot:1998:DTM,
  author =       {Wilmot, Dick},
  title =        {Data threaded microarchitecture},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {26},
  number =       {5},
  pages =        {22--32},
  month =        dec,
  year =         {1998},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:21 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Yuen:1999:SR,
  author =       {Yuen, C. K.},
  title =        {Stack and {RISC}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {1},
  pages =        {3--9},
  month =        mar,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:35 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Baylor:1999:USS,
  author =       {Baylor, Sandra Johnson},
  title =        {Unified scalable shared memory architectures},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {1},
  pages =        {10--21},
  month =        mar,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:35 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{DeWitt:1999:PTL,
  author =       {DeWitt, Anthony and Gross, Thomas},
  title =        {The potential of thread-level speculation based
                 on value profiling},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {1},
  pages =        {22--22},
  month =        mar,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:35 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Kalamatianos:1999:IAI,
  author =       {Kalamatianos, John and Kaeli, David R.},
  title =        {Improving the accuracy of indirect branch prediction
                 via branch classification},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {1},
  pages =        {23--26},
  month =        mar,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:35 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Ju:1999:PMD,
  author =       {Ju, Roy Dz-ching and Collard, Jean-Fran{\c{c}}ois and
                 Oukbir, Karim},
  title =        {Probabilistic memory disambiguation and its
                 application to data speculation},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {1},
  pages =        {27--30},
  month =        mar,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:35 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Postiff:1999:LIL,
  author =       {Postiff, Matthew A. and Greene, David A. and
                 Tyson, Gary S. and Mudge, Trevor N.},
  title =        {The limits of instruction level parallelism in
                 {SPEC95} applications},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {1},
  pages =        {31--34},
  month =        mar,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:35 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Yang:1999:LMJ,
  author =       {Yang, Byung-Sun and Lee, Junpyo and Park, Jinpyo and
                 Moon, Soo-Mook and Ebcio{\u{g}}lu, Kemal and
                 Altman, Erik},
  title =        {Lightweight monitor for {Java VM}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {1},
  pages =        {35--38},
  month =        mar,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:35 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Rao:1999:SAU,
  author =       {Rao, Amit and Pande, Santosh},
  title =        {Storage assignment using expression tree
                 transformations to generate compact and efficient {DSP}
                 code},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {1},
  pages =        {39--42},
  month =        mar,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:35 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Flautner:1999:HLS,
  author =       {Flautner, Kriszti{\'a}n and Tyson, Gary S. and
                 Mudge, Trevor},
  title =        {A high level simulator integrated with the {Mirv}
                 compiler},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {1},
  pages =        {43--46},
  month =        mar,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:35 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Casse:1999:UAI,
  author =       {Cass{\'e}, H. and F{\'e}raud, L. and Rochange, C. and
                 Sainrat, P.},
  title =        {Using the abstract interpretation technique for static
                 pointer analysis},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {1},
  pages =        {47--50},
  month =        mar,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:35 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Bahar:1999:CSC,
  author =       {Bahar, Iris and Calder, Brad and Grunwald, Dirk},
  title =        {A comparison of software code reordering and victim
                 buffers},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {1},
  pages =        {51--54},
  month =        mar,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:35 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Carr:1999:ISP,
  author =       {Carr, Steve and Sweany, Philip},
  title =        {Improving software pipelining with hardware support
                 for self-spatial loads},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {1},
  pages =        {55--58},
  month =        mar,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:35 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Barua:1999:MCM,
  author =       {Barua, Rajeev and Lee, Walter and Amarasinghe, Saman and
                 Agarwal, Anant},
  title =        {{Maps}: a compiler-managed memory system for raw
                 machines},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {2},
  pages =        {4--15},
  month =        may,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Vajapeyam:1999:DVM,
  author =       {Vajapeyam, Sriram and Joseph, P. J. and Mitra, Tulika},
  title =        {Dynamic vectorization: a mechanism for exploiting
                 far-flung {ILP} in ordinary programs},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {2},
  pages =        {16--27},
  month =        may,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

%%% NOTE(review): title word normalized from co/processor to coprocessor,
%%% the spelling believed to be on the ISCA '99 paper; confirm against
%%% the printed proceedings.
@Article{Goldstein:1999:PCP,
  author =       "Seth Copen Goldstein and Herman Schmit and Matthew Moe
                 and Mihai Budiu and Srihari Cadambi and R. Reed Taylor
                 and Ronald Laufer",
  title =        "{PipeRench}: a coprocessor for streaming multimedia
                 acceleration",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "27",
  number =       "2",
  pages =        "28--39",
  month =        may,
  year =         "1999",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:49 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Yoaz:1999:STI,
  author =       {Yoaz, Adi and Erez, Mattan and Ronen, Ronny and
                 Jourdan, Stephan},
  title =        {Speculation techniques for improving load related
                 instruction scheduling},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {2},
  pages =        {42--53},
  month =        may,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Bekerman:1999:CLA,
  author =       {Bekerman, Michael and Jourdan, Stephan and
                 Ronen, Ronny and Kirshenboim, Gilad and
                 Rappoport, Lihu and Yoaz, Adi and Weiser, Uri},
  title =        {Correlated load-address predictors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {2},
  pages =        {54--63},
  month =        may,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Calder:1999:SVP,
  author =       {Calder, Brad and Reinman, Glenn and Tullsen, Dean M.},
  title =        {Selective value prediction},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {2},
  pages =        {64--74},
  month =        may,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Qiu:1999:TLM,
  author =       {Qiu, Xiaogang and Dubois, Michel},
  title =        {Tolerating late memory traps in {ILP} processors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {2},
  pages =        {76--87},
  month =        may,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Luk:1999:MFE,
  author =       {Luk, Chi-Keung and Mowry, Todd C.},
  title =        {Memory forwarding: enabling aggressive layout
                 optimizations by guaranteeing the safety of data
                 relocation},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {2},
  pages =        {88--99},
  month =        may,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Cho:1999:DLV,
  author =       {Cho, Sangyeun and Yew, Pen-Chung and Lee, Gyungho},
  title =        {Decoupling local variable accesses in a wide-issue
                 superscalar processor},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {2},
  pages =        {100--110},
  month =        may,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Roth:1999:EJP,
  author =       {Roth, Amir and Sohi, Gurindar S.},
  title =        {Effective jump-pointer prefetching for linked data
                 structures},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {2},
  pages =        {111--121},
  month =        may,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Ranganathan:1999:PIV,
  author =       {Ranganathan, Parthasarathy and Adve, Sarita and
                 Jouppi, Norman P.},
  title =        {Performance of image and video processing with
                 general-purpose processors and media {ISA} extensions},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {2},
  pages =        {124--135},
  month =        may,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Merten:1999:HDP,
  author =       {Merten, Matthew C. and Trick, Andrew R. and
                 George, Christopher N. and Gyllenhaal, John C. and
                 Hwu, Wen-mei W.},
  title =        {A hardware-driven profiling scheme for identifying
                 program hot spots to support runtime optimization},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {2},
  pages =        {136--147},
  month =        may,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Shen:1999:CRF,
  author =       {Shen, Xiaowei and Arvind and Rudolph, Larry},
  title =        {Commit-reconcile \& fences {(CRF)}: a new memory
                 model for architects and compiler writers},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {2},
  pages =        {150--161},
  month =        may,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Gniady:1999:SIR,
  author =       {Gniady, Chris and Falsafi, Babak and Vijaykumar, T. N.},
  title =        {Is {SC + ILP = RC}?},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {2},
  pages =        {162--171},
  month =        may,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
  keywords =     {Instruction level parallelism (ILP); release
                 consistency (RC); sequential consistency (SC)},
}

@Article{Lai:1999:MSP,
  author =       {Lai, An-Chow and Falsafi, Babak},
  title =        {Memory sharing predictor: the key to a speculative
                 coherent {DSM}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {2},
  pages =        {172--183},
  month =        may,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Chappell:1999:SSM,
  author =       {Chappell, Robert S. and Stark, Jared and
                 Kim, Sangwook P. and Reinhardt, Steven K. and
                 Patt, Yale N.},
  title =        {Simultaneous subordinate microthreading {(SSMT)}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {2},
  pages =        {186--195},
  month =        may,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Black:1999:BBT,
  author =       {Black, Bryan and Rychlik, Bohuslav and Shen, John Paul},
  title =        {The block-based trace cache},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {2},
  pages =        {196--207},
  month =        may,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{August:1999:PDL,
  author =       {August, David I. and Sias, John W. and
                 Puiatti, Jean-Michel and Mahlke, Scott A. and
                 Connors, Daniel A. and Crozier, Kevin M. and
                 Hwu, Wen-mei W.},
  title =        {The program decision logic approach to predicated
                 execution},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {2},
  pages =        {208--219},
  month =        may,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Cuppu:1999:PCC,
  author =       {Cuppu, Vinodh and Jacob, Bruce and Davis, Brian and
                 Mudge, Trevor},
  title =        {A performance comparison of contemporary {DRAM}
                 architectures},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {2},
  pages =        {222--233},
  month =        may,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Reinman:1999:SFE,
  author =       {Reinman, Glenn and Austin, Todd and Calder, Brad},
  title =        {A scalable front-end architecture for fast instruction
                 delivery},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {2},
  pages =        {234--245},
  month =        may,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Kim:1999:AEA,
  author =       {Kim, Seongwoo and Somani, Arun K.},
  title =        {Area efficient architectures for information integrity
                 in cache memories},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {2},
  pages =        {246--255},
  month =        may,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Nakra:1999:VPV,
  author =       {Nakra, Tarun and Gupta, Rajiv and Soffa, Mary Lou},
  title =        {Value prediction in {VLIW} machines},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {2},
  pages =        {258--269},
  month =        may,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Tullsen:1999:SVP,
  author =       {Tullsen, Dean M. and Seng, John S.},
  title =        {Storageless value prediction using prior register
                 values},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {2},
  pages =        {270--279},
  month =        may,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Bilas:1999:UNI,
  author =       {Bilas, Angelos and Liao, Cheng and Singh, Jaswinder Pal},
  title =        {Using network interface support to avoid asynchronous
                 protocol processing in shared virtual memory systems},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {2},
  pages =        {282--293},
  month =        may,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Bilir:1999:MSN,
  author =       {Bilir, E. Ender and Dickson, Ross M. and Hu, Ying and
                 Plakal, Manoj and Sorin, Daniel J. and Hill, Mark D. and
                 Wood, David A.},
  title =        {Multicast snooping: a new coherence method using a
                 multicast address network},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {2},
  pages =        {294--304},
  month =        may,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Jiang:1999:SAP,
  author =       {Jiang, Dongming and Singh, Jaswinder Pal},
  title =        {Scaling application performance on a cache-coherent
                 multiprocessor},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {2},
  pages =        {305--316},
  month =        may,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Anonymous:1999:MSF,
  author =       {Anonymous},
  title =        {In memoriam---{SIGARCH} founder: {Caxton C. Foster}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {3},
  pages =        {1--3},
  month =        jun,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:59 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Hwang:1999:SSI,
  author =       {Hwang, Seung H. and Choi, Gwan S.},
  title =        {Selective-set-invalidation {(SSI)} for
                 soft-error-resilient cache architecture},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {3},
  pages =        {4--9},
  month =        jun,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:59 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Cheng:1999:DHP,
  author =       {Cheng, Peng and Jin, Hai and Zhang, Jiangling},
  title =        {Design of high performance {RAID} in real-time
                 system},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {3},
  pages =        {10--17},
  month =        jun,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:59 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Yuen:1999:ASC,
  author =       {Yuen, C. K.},
  title =        {Architectural support for the cache based vector
                 computation},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {3},
  pages =        {18--23},
  month =        jun,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:59 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Driker:1999:DCC,
  author =       {Driker, Benjamin},
  title =        {Disbursed control computer architecture},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {3},
  pages =        {24--31},
  month =        jun,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:59 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Khalid:1999:PEM,
  author =       {Khalid, Humayun},
  title =        {Performance evaluation of multimedia systems with
                 {MPEG-2} bitstreams},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {3},
  pages =        {32--37},
  month =        jun,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:59 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Khalid:1999:MPE,
  author =       {Khalid, Humayun},
  title =        {A methodology for performance evaluation of systems
                 with large emulation code},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {3},
  pages =        {38--42},
  month =        jun,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:59 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Khalid:1999:TMB,
  author =       {Khalid, Humayun},
  title =        {Tracing multimedia benchmarks with five degrees of
                 validation},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {3},
  pages =        {43--48},
  month =        jun,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:59 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Khalid:1999:PET,
  author =       {Khalid, Humayun},
  title =        {Performance evaluation of two operating systems},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {3},
  pages =        {49--52},
  month =        jun,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:59 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:1999:INa,
  author =       {Mark Thorson},
  title =        {{Internet} Nuggets},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {3},
  pages =        {53--60},
  month =        jun,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:59 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Machanick:1999:CRA,
  author =       "Philip Machanick",
  title =        "Correction to {RAMpage ASPLOS} paper",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "27",
  number =       "4",
  pages =        "2--5",
  month =        sep,
  year =         "1999",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:14 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Shahhoseini:1999:ABP,
  author =       {H. S. Shahhoseini and M. Naderi and S. Nemati},
  title =        {Achieving the best performance on superscalar
                 processors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {4},
  pages =        {6--11},
  month =        sep,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:14 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:1999:INb,
  author =       {Mark Thorson},
  title =        {{Internet} Nuggets},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {4},
  pages =        {12--14},
  month =        sep,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:14 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Torrant:1999:SMS,
  author =       {Marc Torrant and Muhammad Shaaban and Roy Czernikowski
                 and Ken Hsu},
  title =        {A simultaneous multithreading simulator},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {5},
  pages =        {1--5},
  month =        dec,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:22 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:1999:INc,
  author =       {Mark Thorson},
  title =        {{Internet} Nuggets},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {27},
  number =       {5},
  pages =        {6--10},
  month =        dec,
  year =         {1999},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:22 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Dai:2000:LSO,
  author =       {Min Dai and Christine Eisenbeis and Sid-Ahmed-Ali
                 Touati},
  title =        {Load-store optimization for software pipelining},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {1},
  pages =        {3--10},
  month =        mar,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:36 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Clauss:2000:AML,
  author =       {Philippe Clauss and Beno{\^\i}t Meister},
  title =        {Automatic memory layout transformations to optimize
                 spatial locality in parameterized loop nests},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {1},
  pages =        {11--19},
  month =        mar,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:36 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Kreaseck:2000:LTB,
  author =       {Barbara Kreaseck and Dean Tullsen and Brad Calder},
  title =        {Limits of task-based parallelism in irregular
                 applications},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {1},
  pages =        {20--20},
  month =        mar,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:36 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Lee:2000:RVC,
  author =       {Junpyo Lee and Byung-Sun Yang and Suhyun Kim and Kemal
                 Ebcio{\u{g}}lu and Erik Altman and Seungil Lee and Yoo
                 C. Chung and Heungbok Lee and Je Hyung Lee and Soo-Mook
                 Moon},
  title =        {Reducing virtual call overheads in a {Java VM}
                 just-in-time compiler},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {1},
  pages =        {21--33},
  month =        mar,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:36 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Sadler:2000:APE,
  author =       {Chris Sadler and Sandeep K. S. Gupta and Rohit
                 Bhatia},
  title =        {Applying predication to efficiently handle runtime
                 class testing},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {1},
  pages =        {34--42},
  month =        mar,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:36 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Bermudo:2000:OCM,
  author =       {Nerina Bermudo and Xavier Vera and Antonio
                 Gonz{\'a}lez and Josep Llosa},
  title =        {Optimizing cache miss equations polyhedra},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {1},
  pages =        {43--52},
  month =        mar,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:36 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Unger:2000:CCA,
  author =       {A. Unger and E. Zehendner and Th. Ungerer},
  title =        {A combined compiler and architecture technique to
                 control multithreaded execution of branches and loop
                 iterations},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {1},
  pages =        {53--61},
  month =        mar,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:36 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Aydin:2000:UCL,
  author =       {Hakan Aydin and David Kaeli},
  title =        {Using cache line coloring to perform aggressive
                 procedure inlining},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {1},
  pages =        {62--71},
  month =        mar,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:36 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Tyagi:2000:COP,
  author =       {Akhilesh Tyagi and Gyungho Lee},
  title =        {A compiler optimization paradigm for dynamic energy
                 management},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {1},
  pages =        {72--76},
  month =        mar,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:36 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:2000:INa,
  author =       {Mark Thorson},
  title =        {{Internet} Nuggets},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {1},
  pages =        {77--78},
  month =        mar,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:36 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Steffan:2000:SAT,
  author =       "J. Gregory Steffan and Christopher B. Colohan and
                 Antonia Zhai and Todd C. Mowry",
  title =        "A scalable approach to thread-level speculation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "28",
  number =       "2",
  pages =        "1--12",
  month =        may,
  year =         "2000",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:49 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Cintra:2000:ASS,
  author =       {Marcelo Cintra and Jos{\'e} F. Mart{\'\i}nez and Josep
                 Torrellas},
  title =        {Architectural support for scalable speculative
                 parallelization in shared-memory multiprocessors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {2},
  pages =        {13--24},
  month =        may,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Reinhardt:2000:TFD,
  author =       {Steven K. Reinhardt and Shubhendu S. Mukherjee},
  title =        {Transient fault detection via simultaneous
                 multithreading},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {2},
  pages =        {25--36},
  month =        may,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Jacobson:2000:TP,
  author =       {Quinn Jacobson and James E. Smith},
  title =        {Trace preconstruction},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {2},
  pages =        {37--46},
  month =        may,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Rakvic:2000:CTM,
  author =       {Ryan Rakvic and Bryan Black and John Paul Shen},
  title =        {Completion time multiple branch prediction for
                 enhancing trace cache performance},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {2},
  pages =        {47--58},
  month =        may,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Merten:2000:HMD,
  author =       "Matthew C. Merten and Andrew R. Trick and Erik M.
                 Nystrom and Ronald D. Barnes and Wen-mei W. Hwu",
  title =        "A hardware mechanism for dynamic extraction and
                 relayout of program hot spots",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "28",
  number =       "2",
  pages =        "59--70",
  month =        may,
  year =         "2000",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:49 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Oskin:2000:HCS,
  author =       {Mark Oskin and Frederic T. Chong and Matthew Farrens},
  title =        {{HLS}: combining statistical and symbolic simulation
                 to guide microprocessor designs},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {2},
  pages =        {71--82},
  month =        may,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Brooks:2000:WFA,
  author =       {David Brooks and Vivek Tiwari and Margaret Martonosi},
  title =        {{Wattch}: a framework for architectural-level power
                 analysis and optimizations},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {2},
  pages =        {83--94},
  month =        may,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Vijaykrishnan:2000:EDI,
  author =       {N. Vijaykrishnan and M. Kandemir and M. J. Irwin and
                 H. S. Kim and W. Ye},
  title =        {Energy-driven integrated hardware-software
                 optimizations using {SimplePower}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {2},
  pages =        {95--106},
  month =        may,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Hallnor:2000:FAS,
  author =       {Erik G. Hallnor and Steven K. Reinhardt},
  title =        {A fully associative software-managed cache design},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {2},
  pages =        {107--116},
  month =        may,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Saulsbury:2000:RBT,
  author =       {Ashley Saulsbury and Fredrik Dahlgren and Per
                 Stenstr{\"o}m},
  title =        {Recency-based {TLB} preloading},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {2},
  pages =        {117--127},
  month =        may,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Rixner:2000:MAS,
  author =       {Scott Rixner and William J. Dally and Ujval J. Kapasi
                 and Peter Mattson and John D. Owens},
  title =        {Memory access scheduling},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {2},
  pages =        {128--138},
  month =        may,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Lai:2000:SAT,
  author =       {An-Chow Lai and Babak Falsafi},
  title =        {Selective, accurate, and timely self-invalidation
                 using last-touch prediction},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {2},
  pages =        {139--148},
  month =        may,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Margolus:2000:EDA,
  author =       {Norman Margolus},
  title =        {An embedded {DRAM} architecture for large-scale
                 spatial-lattice computations},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {2},
  pages =        {149--160},
  month =        may,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Mai:2000:SMM,
  author =       {Ken Mai and Tim Paaske and Nuwan Jayasena and Ron Ho
                 and William J. Dally and Mark Horowitz},
  title =        {Smart {Memories}: a modular reconfigurable
                 architecture},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {2},
  pages =        {161--171},
  month =        may,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Zilles:2000:UBS,
  author =       {Craig B. Zilles and Gurindar S. Sohi},
  title =        {Understanding the backward slices of performance
                 degrading instructions},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {2},
  pages =        {172--181},
  month =        may,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Lepak:2000:VLS,
  author =       {Kevin M. Lepak and Mikko H. Lipasti},
  title =        {On the value locality of store instructions},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {2},
  pages =        {182--191},
  month =        may,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Cvetanovic:2000:PAA,
  author =       {Zarka Cvetanovic and R. E. Kessler},
  title =        {Performance analysis of the {Alpha 21264}-based
                 {Compaq ES40} system},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {2},
  pages =        {192--202},
  month =        may,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Faraboschi:2000:LTP,
  author =       {Paolo Faraboschi and Geoffrey Brown and Joseph A.
                 Fisher and Giuseppe Desoli and Fred Homewood},
  title =        {{Lx}: a technology platform for customizable {VLIW}
                 embedded processing},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {2},
  pages =        {203--213},
  month =        may,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Ranganathan:2000:RCT,
  author =       {Parthasarathy Ranganathan and Sarita Adve and Norman
                 P. Jouppi},
  title =        {Reconfigurable caches and their application to media
                 processing},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {2},
  pages =        {214--224},
  month =        may,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Ye:2000:CHP,
  author =       {Zhi Alex Ye and Andreas Moshovos and Scott Hauck and
                 Prithviraj Banerjee},
  title =        {{CHIMAERA}: a high-performance architecture with a
                 tightly-coupled reconfigurable functional unit},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {2},
  pages =        {225--235},
  month =        may,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Henry:2000:CWW,
  author =       {Dana S. Henry and Bradley C. Kuszmaul and Gabriel H.
                 Loh and Rahul Sami},
  title =        {Circuits for wide-window superscalar processors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {2},
  pages =        {236--247},
  month =        may,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Agarwal:2000:CRV,
  author =       {Vikas Agarwal and M. S. Hrishikesh and Stephen W.
                 Keckler and Doug Burger},
  title =        {Clock rate versus {IPC}: the end of the road for
                 conventional microarchitectures},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {2},
  pages =        {248--259},
  month =        may,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Smith:2000:VIS,
  author =       {J. E. Smith and Greg Faanes and Rabin Sugumar},
  title =        {Vector instruction set support for conditional
                 operations},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {2},
  pages =        {260--269},
  month =        may,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Chou:2000:IPC,
  author =       {Yuan Chou and John Paul Shen},
  title =        {Instruction path coprocessors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {2},
  pages =        {270--281},
  month =        may,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Barroso:2000:PSA,
  author =       {Luiz Andr{\'e} Barroso and Kourosh Gharachorloo and
                 Robert McNamara and Andreas Nowatzyk and Shaz Qadeer
                 and Barton Sano and Scott Smith and Robert Stets and
                 Ben Verghese},
  title =        {{Piranha}: a scalable architecture based on
                 single-chip multiprocessing},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {2},
  pages =        {282--293},
  month =        may,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Radhakrishnan:2000:AIE,
  author =       {Ramesh Radhakrishnan and Deependra Talla and Lizy
                 Kurian John},
  title =        {Allowing for {ILP} in an embedded {Java} processor},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {2},
  pages =        {294--305},
  month =        may,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Bekerman:2000:ELA,
  author =       {Michael Bekerman and Adi Yoaz and Freddy Gabbay and
                 Stephan Jourdan and Maxim Kalaev and Ronny Ronen},
  title =        {Early load address resolution via register tracking},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {28},
  number =       {2},
  pages =        {306--315},
  month =        may,
  year =         {2000},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:49 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

%%% ACM SIGARCH Computer Architecture News 28(2):316--325, May 2000.
@Article{Cruz:2000:MBR,
  author          = {Jos{\'e}-Lorenzo Cruz and
                     Antonio Gonz{\'a}lez and
                     Mateo Valero and
                     Nigel P. Topham},
  title           = {Multiple-banked register file architectures},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {28},
  number          = {2},
  pages           = {316--325},
  year            = {2000},
  month           = may,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:40:49 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% The first two authors have compound Spanish surnames, so they are
%%% entered in `Last, First' form; otherwise BibTeX would treat the first
%%% surname as a given name.  The third name already parses correctly
%%% (von part `de').  The citation key is left unchanged so that
%%% existing citations continue to resolve.
@Article{Fernandez:2000:EPN,
  author =       "Sahelices Fern{\'a}ndez, Benjam{\'\i}n and Llanos
                 Ferraris, Diego R. and Agust{\'\i}n de Dios
                 Hern{\'a}ndez",
  title =        "Exploiting parallelism in a network of workstations
                 using {COMA-BC}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "28",
  number =       "3",
  pages =        "1--8",
  month =        jun,
  year =         "2000",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:59 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% ACM SIGARCH Computer Architecture News 28(3):9--13, June 2000.
@Article{Thorson:2000:INb,
  author          = {Mark Thorson},
  title           = {{Internet} Nuggets},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {28},
  number          = {3},
  pages           = {9--13},
  year            = {2000},
  month           = jun,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:40:59 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% ACM SIGARCH Computer Architecture News 28(4):4--10, September 2000.
@Article{Lafitte:2000:RDH,
  author          = {Jean-Louis Lafitte},
  title           = {Regarding a device to help battering the {RAM} wall},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {28},
  number          = {4},
  pages           = {4--10},
  year            = {2000},
  month           = sep,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:41:14 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% ACM SIGARCH Computer Architecture News 28(4):11--18, September 2000.
@Article{Petit:2000:LSE,
  author          = {S. Petit and
                     J. A. Gil and
                     J. Sahuquillo and
                     A. Pont},
  title           = {{LIDE}: a simulation environment for
                     shared virtual memory systems},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {28},
  number          = {4},
  pages           = {11--18},
  year            = {2000},
  month           = sep,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:41:14 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% ACM SIGARCH Computer Architecture News 28(5):1--12, December 2000.
@Article{Schlosser:2000:DCS,
  author          = {Steven W. Schlosser and
                     John Linwood Griffin and
                     David F. Nagle and
                     Gregory R. Ganger},
  title           = {Designing computer systems with {MEMS}-based storage},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {28},
  number          = {5},
  pages           = {1--12},
  year            = {2000},
  month           = dec,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:41:22 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% ACM SIGARCH Computer Architecture News 28(5):13--24, December 2000.
@Article{Gharachorloo:2000:ADA,
  author          = {Kourosh Gharachorloo and
                     Madhu Sharma and
                     Simon Steely and
                     Stephen {Van Doren}},
  title           = {Architecture and design of {AlphaServer GS320}},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {28},
  number          = {5},
  pages           = {13--24},
  year            = {2000},
  month           = dec,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:41:22 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% ACM SIGARCH Computer Architecture News 28(5):25--36, December 2000.
@Article{Martin:2000:TSA,
  author          = {Milo M. K. Martin and
                     Daniel J. Sorin and
                     Anastassia Ailamaki and
                     Alaa R. Alameldeen and
                     Ross M. Dickson and
                     Carl J. Mauer and
                     Kevin E. Moore and
                     Manoj Plakal and
                     Mark D. Hill and
                     David A. Wood},
  title           = {Timestamp snooping: an approach for extending {SMPs}},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {28},
  number          = {5},
  pages           = {25--36},
  year            = {2000},
  month           = dec,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:41:22 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% The system name is MemorIES; the digit 3 that followed it in the
%%% publisher metadata was a footnote marker, not part of the title.
%%% The third author's surname is spelled Sugavanam.  The citation key
%%% is left unchanged so that existing citations continue to resolve.
@Article{Nanda:2000:MPR,
  author =       "Ashwini Nanda and Kwok-Ken Mak and Krishnan Sugavanam
                 and Ramendra K. Sahoo and Vijayaraghavan Soundararajan
                 and T. Basil Smith",
  title =        "{MemorIES}: a programmable, real-time hardware
                 emulation tool for multiprocessor server design",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "28",
  number =       "5",
  pages =        "37--48",
  month =        dec,
  year =         "2000",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:22 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% The control space after the abbreviation `vs.' keeps TeX from
%%% applying end-of-sentence spacing in the middle of the title.
@Article{Gibson:2000:FVS,
  author =       "Jeff Gibson and Robert Kunz and David Ofelt and Mark
                 Horowitz and John Hennessy and Mark Heinrich",
  title =        "{FLASH} vs.\ {(Simulated) FLASH}: closing the
                 simulation loop",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "28",
  number =       "5",
  pages =        "49--58",
  month =        dec,
  year =         "2000",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:22 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% ACM SIGARCH Computer Architecture News 28(5):59--70, December 2000.
@Article{Chou:2000:UML,
  author          = {Andy Chou and
                     Benjamin Chelf and
                     Dawson Engler and
                     Mark Heinrich},
  title           = {Using meta-level compilation to check
                     {FLASH} protocol code},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {28},
  number          = {5},
  pages           = {59--70},
  year            = {2000},
  month           = dec,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:41:22 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% ACM SIGARCH Computer Architecture News 28(5):71--81, December 2000.
@Article{Bhoedjang:2000:EDA,
  author          = {Raoul A. F. Bhoedjang and
                     Kees Verstoep and
                     Tim R{\"u}hl and
                     Henri E. Bal and
                     Rutger F. H. Hofman},
  title           = {Evaluating design alternatives for
                     reliable communication on high-speed networks},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {28},
  number          = {5},
  pages           = {71--81},
  year            = {2000},
  month           = dec,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:41:22 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% ACM SIGARCH Computer Architecture News 28(5):82--92, December 2000.
@Article{Mattson:2000:CS,
  author          = {Peter Mattson and
                     William J. Dally and
                     Scott Rixner and
                     Ujval J. Kapasi and
                     John D. Owens},
  title           = {Communication scheduling},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {28},
  number          = {5},
  pages           = {82--92},
  year            = {2000},
  month           = dec,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:41:22 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% ACM SIGARCH Computer Architecture News 28(5):93--104, December 2000.
@Article{Hill:2000:SAD,
  author          = {Jason Hill and
                     Robert Szewczyk and
                     Alec Woo and
                     Seth Hollar and
                     David Culler and
                     Kristofer Pister},
  title           = {System architecture directions for networked sensors},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {28},
  number          = {5},
  pages           = {93--104},
  year            = {2000},
  month           = dec,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:41:22 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% ACM SIGARCH Computer Architecture News 28(5):105--116, December 2000.
@Article{Lebeck:2000:PAP,
  author          = {Alvin R. Lebeck and
                     Xiaobo Fan and
                     Heng Zeng and
                     Carla Ellis},
  title           = {Power aware page allocation},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {28},
  number          = {5},
  pages           = {105--116},
  year            = {2000},
  month           = dec,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:41:22 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% ACM SIGARCH Computer Architecture News 28(5):117--128, December 2000.
@Article{Berger:2000:HSM,
  author          = {Emery D. Berger and
                     Kathryn S. McKinley and
                     Robert D. Blumofe and
                     Paul R. Wilson},
  title           = {{Hoard}: a scalable memory allocator for
                     multithreaded applications},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {28},
  number          = {5},
  pages           = {117--128},
  year            = {2000},
  month           = dec,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:41:22 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% ACM SIGARCH Computer Architecture News 28(5):129--138, December 2000.
@Article{Flautner:2000:TLP,
  author          = {Kristi{\'a}n Flautner and
                     Rich Uhlig and
                     Steve Reinhardt and
                     Trevor Mudge},
  title           = {Thread-level parallelism and interactive
                     performance of desktop applications},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {28},
  number          = {5},
  pages           = {129--138},
  year            = {2000},
  month           = dec,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:41:22 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% ACM SIGARCH Computer Architecture News 28(5):139--149, December 2000.
@Article{Kawahito:2000:ENP,
  author          = {Motohiro Kawahito and
                     Hideaki Komatsu and
                     Toshio Nakatani},
  title           = {Effective null pointer check elimination
                     utilizing hardware trap},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {28},
  number          = {5},
  pages           = {139--149},
  year            = {2000},
  month           = dec,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:41:22 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% ACM SIGARCH Computer Architecture News 28(5):150--159, December 2000.
@Article{Zhang:2000:FVL,
  author          = {Youtao Zhang and
                     Jun Yang and
                     Rajiv Gupta},
  title           = {Frequent value locality and value-centric
                     data cache design},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {28},
  number          = {5},
  pages           = {150--159},
  year            = {2000},
  month           = dec,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:41:22 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% The second author's surname is Erlingsson (double s), with an
%%% accented initial; the third author's hyphenated initials are
%%% written S.-T., each initial carrying its own period.
@Article{Burrows:2000:EFV,
  author =       "M. Burrows and {\'U}. Erlingsson and S.-T. A. Leung
                 and M. T. Vandevoorde and C. A. Waldspurger and K.
                 Walker and W. E. Weihl",
  title =        "Efficient and flexible value sampling",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "28",
  number =       "5",
  pages =        "160--167",
  month =        dec,
  year =         "2000",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:22 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% David Lie and Chandramohan Thekkath are two separate authors; the
%%% separator between them was missing, which merged them into one
%%% name.  The citation key is left unchanged so that existing
%%% citations continue to resolve.
@Article{Thekkath:2000:ASC,
  author =       "David Lie and Chandramohan Thekkath and Mark Mitchell
                 and Patrick Lincoln and Dan Boneh and John Mitchell and
                 Mark Horowitz",
  title =        "Architectural support for copy and tamper resistant
                 software",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "28",
  number =       "5",
  pages =        "168--177",
  month =        dec,
  year =         "2000",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:22 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% ACM SIGARCH Computer Architecture News 28(5):178--189, December 2000.
@Article{Burke:2000:ASF,
  author          = {Jerome Burke and
                     John McDonald and
                     Todd Austin},
  title           = {Architectural support for fast
                     symmetric-key cryptography},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {28},
  number          = {5},
  pages           = {178--189},
  year            = {2000},
  month           = dec,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:41:22 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% The published author list has twelve names; Westley Weimer (between
%%% Weatherspoon and Wells) was missing from this entry.
@Article{Kubiatowicz:2000:OAG,
  author =       "John Kubiatowicz and David Bindel and Yan Chen and
                 Steven Czerwinski and Patrick Eaton and Dennis Geels
                 and Ramakrishna Gummadi and Sean Rhea and Hakim
                 Weatherspoon and Westley Weimer and Chris Wells and
                 Ben Zhao",
  title =        "{OceanStore}: an architecture for global-scale
                 persistent storage",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "28",
  number =       "5",
  pages =        "190--201",
  month =        dec,
  year =         "2000",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:22 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% ACM SIGARCH Computer Architecture News 28(5):202--211, December 2000.
@Article{Duesterwald:2000:SPH,
  author          = {Evelyn Duesterwald and
                     Vasanth Bala},
  title           = {Software profiling for hot path prediction:
                     less is more},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {28},
  number          = {5},
  pages           = {202--211},
  year            = {2000},
  month           = dec,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:41:22 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% ACM SIGARCH Computer Architecture News 28(5):212--221, December 2000.
@Article{Zahir:2000:CCD,
  author          = {Rumi Zahir and
                     Jonathan Ross and
                     Dale Morris and
                     Drew Hess},
  title           = {{OS} and compiler considerations in the
                     design of the {IA-64} architecture},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {28},
  number          = {5},
  pages           = {212--221},
  year            = {2000},
  month           = dec,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:41:22 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% ACM SIGARCH Computer Architecture News 28(5):222--233, December 2000.
@Article{Connors:2000:HSD,
  author          = {Daniel A. Connors and
                     Hillery C. Hunter and
                     Ben-Chung Cheng and
                     Wen-mei W. Hwu},
  title           = {Hardware support for dynamic activation
                     of compiler-directed computation reuse},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {28},
  number          = {5},
  pages           = {222--233},
  year            = {2000},
  month           = dec,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:41:22 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% ACM SIGARCH Computer Architecture News 28(5):234--244, December 2000.
@Article{Snavely:2000:SJS,
  author          = {Allan Snavely and
                     Dean M. Tullsen},
  title           = {Symbiotic job scheduling for a
                     simultaneous multithreaded processor},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {28},
  number          = {5},
  pages           = {234--244},
  year            = {2000},
  month           = dec,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:41:22 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% ACM SIGARCH Computer Architecture News 28(5):245--256, December 2000.
@Article{Redstone:2000:AOS,
  author          = {Joshua A. Redstone and
                     Susan J. Eggers and
                     Henry M. Levy},
  title           = {An analysis of operating system behavior
                     on a simultaneous multithreaded architecture},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {28},
  number          = {5},
  pages           = {245--256},
  year            = {2000},
  month           = dec,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:41:22 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% The third author's surname is spelled Rotenberg.
@Article{Sundaramoorthy:2000:SPI,
  author =       "Karthik Sundaramoorthy and Zach Purser and Eric
                 Rotenberg",
  title =        "Slipstream processors: improving both performance and
                 fault tolerance",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "28",
  number =       "5",
  pages =        "257--268",
  month =        dec,
  year =         "2000",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:22 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% ACM SIGARCH Computer Architecture News 29(1):2--7, March 2001.
@Article{Wilkes:2001:MGF,
  author          = {Maurice V. Wilkes},
  title           = {The memory gap and the future of
                     high performance memories},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {29},
  number          = {1},
  pages           = {2--7},
  year            = {2001},
  month           = mar,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:40:36 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% ACM SIGARCH Computer Architecture News 29(1):8--15, March 2001.
@Article{Manjikian:2001:MESa,
  author          = {Naraig Manjikian},
  title           = {Multiprocessor enhancements of the
                     {SimpleScalar} tool set},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {29},
  number          = {1},
  pages           = {8--15},
  year            = {2001},
  month           = mar,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:40:36 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% ACM SIGARCH Computer Architecture News 29(1):16--22, March 2001.
@Article{Wang:2001:MAH,
  author          = {Frank Wang},
  title           = {A modified architecture for high-density {MRAM}},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {29},
  number          = {1},
  pages           = {16--22},
  year            = {2001},
  month           = mar,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:40:36 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% ACM SIGARCH Computer Architecture News 29(1):23--25, March 2001.
@Article{Altman:2001:WWB,
  author          = {Erik R. Altman and
                     David Kaeli},
  title           = {{WBT-2000}: {Workshop on Binary Translation 2000}},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {29},
  number          = {1},
  pages           = {23--25},
  year            = {2001},
  month           = mar,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:40:36 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% ACM SIGARCH Computer Architecture News 29(1):26--26, March 2001.
@Article{Srivastava:2001:EOB,
  author          = {Amitabh Srivastava},
  title           = {Emerging opportunities for binary tools},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {29},
  number          = {1},
  pages           = {26--26},
  year            = {2001},
  month           = mar,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:40:36 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% ACM SIGARCH Computer Architecture News 29(1):27--36, March 2001.
@Article{Cain:2001:DBT,
  author          = {Harold W. Cain and
                     Kevin M. Lepak and
                     Mikko H. Lipasti},
  title           = {A dynamic binary translation approach to
                     architectural simulation},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {29},
  number          = {1},
  pages           = {27--36},
  year            = {2001},
  month           = mar,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:40:36 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% ACM SIGARCH Computer Architecture News 29(1):37--42, March 2001.
@Article{Hilgendorf:2001:ITE,
  author          = {Rolf Hilgendorf and
                     Wolfram Sauer},
  title           = {Instruction translation for an
                     experimental {S/390} processor},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {29},
  number          = {1},
  pages           = {37--42},
  year            = {2001},
  month           = mar,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:40:36 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% ACM SIGARCH Computer Architecture News 29(1):43--54, March 2001.
@Article{Ronsse:2001:JRJ,
  author          = {Michiel Ronsse and
                     Koen {De Bosschere}},
  title           = {{JiTI}: a robust just in time
                     instrumentation technique},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {29},
  number          = {1},
  pages           = {43--54},
  year            = {2001},
  month           = mar,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:40:36 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% ACM SIGARCH Computer Architecture News 29(1):55--65, March 2001.
@Article{Ung:2001:OHP,
  author          = {David Ung and
                     Cristina Cifuentes},
  title           = {Optimising hot paths in a dynamic binary translator},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {29},
  number          = {1},
  pages           = {55--65},
  year            = {2001},
  month           = mar,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:40:36 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% ACM SIGARCH Computer Architecture News 29(1):66--74, March 2001.
@Article{Gschwind:2001:OPE,
  author          = {Michael Gschwind and
                     Erik Altman},
  title           = {Optimization and precise exceptions in
                     dynamic compilation},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {29},
  number          = {1},
  pages           = {66--74},
  year            = {2001},
  month           = mar,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:40:36 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% ACM SIGARCH Computer Architecture News 29(1):75--77, March 2001.
@Article{Thorson:2001:INa,
  author          = {Mark Thorson},
  title           = {{Internet} Nuggets},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {29},
  number          = {1},
  pages           = {75--77},
  year            = {2001},
  month           = mar,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:40:36 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% ACM SIGARCH Computer Architecture News 29(2):2--13, May 2001.
@Article{Zilles:2001:EBP,
  author          = {Craig Zilles and
                     Gurindar Sohi},
  title           = {Execution-based prediction using speculative slices},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {29},
  number          = {2},
  pages           = {2--13},
  year            = {2001},
  month           = may,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:40:50 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% ACM SIGARCH Computer Architecture News 29(2):14--25, May 2001.
@Article{Collins:2001:SPL,
  author          = {Jamison D. Collins and
                     Hong Wang and
                     Dean M. Tullsen and
                     Christopher Hughes and
                     Yong-Fong Lee and
                     Dan Lavery and
                     John P. Shen},
  title           = {Speculative precomputation: long-range
                     prefetching of delinquent loads},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {29},
  number          = {2},
  pages           = {14--25},
  year            = {2001},
  month           = may,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:40:50 MDT 2006},
  acknowledgement = ack-nhfb
}

%%% ACM SIGARCH Computer Architecture News 29(2):26--37, May 2001.
@Article{Balasubramonian:2001:DAP,
  author          = {Rajeev Balasubramonian and
                     Sandhya Dwarkadas and
                     David H. Albonesi},
  title           = {Dynamically allocating processor resources
                     between nearby and distant {ILP}},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  volume          = {29},
  number          = {2},
  pages           = {26--37},
  year            = {2001},
  month           = may,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  bibdate         = {Fri May 12 09:40:50 MDT 2006},
  acknowledgement = ack-nhfb
}

@Article{Luk:2001:TML,
  author =       {Chi-Keung Luk},
  title =        {Tolerating memory latency through
                 software-controlled pre-execution in
                 simultaneous multithreading processors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {2},
  pages =        {40--51},
  month =        may,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Annavaram:2001:DPD,
  author =       {Murali Annavaram and Jignesh M. Patel and
                 Edward S. Davidson},
  title =        {Data prefetching by dependence graph precomputation},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {2},
  pages =        {52--61},
  month =        may,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Cuppu:2001:CLS,
  author =       {Vinodh Cuppu and Bruce Jacob},
  title =        {Concurrency, latency, or system overhead:
                 which has the largest impact on uniprocessor
                 {DRAM}-system performance?},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {2},
  pages =        {62--71},
  month =        may,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Fields:2001:FPP,
  author =       {Brian Fields and Shai Rubin and
                 Rastislav Bod{\'\i}k},
  title =        {Focusing processor policies via
                 critical-path prediction},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {2},
  pages =        {74--85},
  month =        may,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Sherwood:2001:ADF,
  author =       {Timothy Sherwood and Brad Calder},
  title =        {Automated design of finite state machine
                 predictors for customized processors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {2},
  pages =        {86--97},
  month =        may,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Wu:2001:BER,
  author =       {Youfeng Wu and Dong-Yuan Chen and Jesse Fang},
  title =        {Better exploration of region-level value
                 locality with integrated computation reuse
                 and value prediction},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {2},
  pages =        {98--108},
  month =        may,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Wu:2001:CFF,
  author =       {Lisa Wu and Chris Weaver and Todd Austin},
  title =        {{CryptoManiac}: a fast flexible architecture
                 for secure communication},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {2},
  pages =        {110--119},
  month =        may,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Yum:2001:QPC,
  author =       {Ki Hwan Yum and Eun Jung Kim and Chita R. Das},
  title =        {{QoS} provisioning in clusters: an
                 investigation of {Router} and {NIC} design},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {2},
  pages =        {120--129},
  month =        may,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Srinivasan:2001:LVC,
  author =       {Srikanth T. Srinivasan and Roy Dz-ching Ju and
                 Alvin R. Lebeck and Chris Wilkerson},
  title =        {Locality vs. criticality},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {2},
  pages =        {132--143},
  month =        may,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Lai:2001:DBP,
  author =       {An-Chow Lai and Cem Fide and Babak Falsafi},
  title =        {Dead-block prediction \& dead-block
                 correlating prefetchers},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {2},
  pages =        {144--154},
  month =        may,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Ramirez:2001:CLO,
  author =       {Alex Ramirez and Luiz Andr{\'e} Barroso and
                 Kourosh Gharachorloo and Robert Cohn and
                 Josep Larriba-Pey and P. Geoffrey Lowney and
                 Mateo Valero},
  title =        {Code layout optimizations for transaction
                 processing workloads},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {2},
  pages =        {155--164},
  month =        may,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Niemier:2001:EEW,
  author =       {Michael Thaddeus Niemier and Peter M. Kogge},
  title =        {Exploring and exploiting wire-level
                 pipelining in emerging technologies},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {2},
  pages =        {166--177},
  month =        may,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Goldstein:2001:NSC,
  author =       {Seth Copen Goldstein and Mihai Budiu},
  title =        {{NanoFabrics}: spatial computing using
                 molecular electronics},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {2},
  pages =        {178--191},
  month =        may,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Lie:2001:SME,
  author =       {David Lie and Andy Chou and Dawson Engler and
                 David L. Dill},
  title =        {A simple method for extracting models for
                 protocol code},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {2},
  pages =        {192--203},
  month =        may,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Prvulovic:2001:RAB,
  author =       {Milos Prvulovic and
                 Mar{\'\i}a Jes{\'u}s Garzar{\'a}n and
                 Lawrence Rauchwerger and Josep Torrellas},
  title =        {Removing architectural bottlenecks to the
                 scalability of speculative parallelization},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {2},
  pages =        {204--215},
  month =        may,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Bahar:2001:PER,
  author =       {R. Iris Bahar and Srilatha Manne},
  title =        {Power and energy reduction via pipeline balancing},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {2},
  pages =        {218--229},
  month =        may,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Folegnani:2001:EEI,
  author =       {Daniele Folegnani and Antonio Gonz{\'a}lez},
  title =        {Energy-effective issue logic},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {2},
  pages =        {230--239},
  month =        may,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Kaxiras:2001:CDE,
  author =       {Stefanos Kaxiras and Zhigang Hu and
                 Margaret Martonosi},
  title =        {Cache decay: exploiting generational behavior
                 to reduce cache leakage power},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {2},
  pages =        {240--251},
  month =        may,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Hughes:2001:VEM,
  author =       {Christopher J. Hughes and Praful Kaul and
                 Sarita V. Adve and Rohit Jain and Chanik Park and
                 Jayanth Srinivasan},
  title =        {Variability in the execution of multimedia
                 applications and implications for
                 architecture},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {2},
  pages =        {254--265},
  month =        may,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Sastry:2001:RPS,
  author =       {S. Subramanya Sastry and Rastislav Bod{\'\i}k
                 and James E. Smith},
  title =        {Rapid profiling via stratified sampling},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {2},
  pages =        {278--289},
  month =        may,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Zilles:2001:BHC,
  author =       {Craig B. Zilles},
  title =        {Benchmark health considered harmful},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {3},
  pages =        {4--5},
  month =        jun,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:00 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Thornock:2001:NTC,
  author =       {Niki C. Thornock and J. Kelly Flanagan},
  title =        {A national trace collection and
                 distribution resource},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {3},
  pages =        {6--10},
  month =        jun,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:00 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:2001:INb,
  author =       {Mark Thorson},
  title =        {{Internet} Nuggets},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {3},
  pages =        {11--15},
  month =        jun,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:00 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Manjikian:2001:MESb,
  author =       {Naraig Manjikian},
  title =        {More enhancements of the {SimpleScalar} tool set},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {4},
  pages =        {5--12},
  month =        sep,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:15 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Cantin:2001:CPS,
  author =       {Jason F. Cantin and Mark D. Hill},
  title =        {Cache performance for selected
                 {SPEC CPU2000} benchmarks},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {4},
  pages =        {13--18},
  month =        sep,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:15 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Zhang:2001:PLA,
  author =       {Jinsuo Zhang},
  title =        {The predictability of load address},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {4},
  pages =        {19--28},
  month =        sep,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:15 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:2001:INc,
  author =       {Mark Thorson},
  title =        {{Internet} Nuggets},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {4},
  pages =        {29--31},
  month =        sep,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:15 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{El-Kharashi:2001:ATA,
  author =       {M. Watheq El-Kharashi and Fayez Elguibaly and
                 Kin F. Li},
  title =        {Adapting {Tomasulo}'s algorithm for bytecode
                 folding based {Java} processors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {5},
  pages =        {1--8},
  month =        dec,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:22 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Bartolini:2001:PAC,
  author =       {S. Bartolini and R. Giorgi and J. Protic and
                 C. A. Prete and M. Valero},
  title =        {Parallel architecture and compilation
                 techniques: selection of workshop papers,
                 {Guest Editors}' introduction},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {5},
  pages =        {9--12},
  month =        dec,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:22 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Acquaviva:2001:ECE,
  author =       "Andrea Acquaviva and Luca Benini and Bruno Ricc{\`o}",
  title =        "Energy characterization of embedded real-time
                 operating systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "29",
  number =       "5",
  pages =        "13--18",
  month =        dec,
  year =         "2001",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:22 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Moncusi:2001:IES,
  author =       "M. {\`A}ngels Moncus{\'\i} and Alex Arenas and
                 Jes{\'u}s Labarta",
  title =        "Improving energy saving in hard real time systems via
                 a modified dual priority scheduling",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "29",
  number =       "5",
  pages =        "19--24",
  month =        dec,
  year =         "2001",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:22 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Vahid:2001:PCP,
  author =       {Frank Vahid and Rilesh Patel and Greg Stitt},
  title =        {Propagating constants past software to
                 hardware peripherals in fixed-application
                 embedded systems},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {5},
  pages =        {25--30},
  month =        dec,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:22 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Aslot:2001:PCS,
  author =       {Vishal Aslot and Rudolf Eigenmann},
  title =        {Performance characteristics of the
                 {SPEC OMP2001} benchmarks},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {5},
  pages =        {31--40},
  month =        dec,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:22 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Bull:2001:MSO,
  author =       {J. Mark Bull and Darragh O'Neill},
  title =        {A microbenchmark suite for {OpenMP 2.0}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {5},
  pages =        {41--48},
  month =        dec,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:22 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Nikolopoulos:2001:EMA,
  author =       {D. S. Nikolopoulos and E. Artiaga and
                 E. Ayguad{\'e} and J. Labarta},
  title =        {Exploiting memory affinity in {OpenMP}
                 through schedule reuse},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {5},
  pages =        {49--55},
  month =        dec,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:22 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Sung:2001:MDA,
  author =       {Michael Sung and Ronny Krashinsky and
                 Krste Asanovi{\'c}},
  title =        {Multithreading decoupled architectures for
                 complexity-effective general purpose
                 computing},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {5},
  pages =        {56--61},
  month =        dec,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:22 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Talla:2001:MDA,
  author =       {Deependra Talla and Lizy K. John},
  title =        {{MediaBreeze}: a decoupled architecture
                 for accelerating multimedia applications},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {5},
  pages =        {62--67},
  month =        dec,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:22 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Nakajima:2001:MCS,
  author =       {Tatsuo Nakajima},
  title =        {A middleware component supporting flexible
                 user interaction for networked home
                 appliances},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {5},
  pages =        {68--75},
  month =        dec,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:22 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Touzet:2001:SSE,
  author =       {David Touzet and Jean-Marc Menaud and
                 Fr{\'e}d{\'e}ric Weis and Paul Couderc and
                 Michel Ban{\^a}tre},
  title =        {{SIDE} surfer: enriching casual meetings
                 with spontaneous information gathering},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {5},
  pages =        {76--83},
  month =        dec,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:22 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Altman:2001:WBT,
  author =       {Erik R. Altman and David R. Kaeli},
  title =        {{Workshop on Binary Translation 2001}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {5},
  pages =        {84--85},
  month =        dec,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:22 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:2001:INd,
  author =       {Mark Thorson},
  title =        {{Internet} Nuggets},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {29},
  number =       {5},
  pages =        {86--90},
  month =        dec,
  year =         {2001},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:22 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Desikan:2002:EME,
  author =       "Rajagopalan Desikan and Doug Burger and Stephen W.
                 Keckler and Lloren{\c{c}} Cruz and Fernando Latorre and
                 Antonio Gonz{\'a}lez and Mateo Valero",
  title =        "Errata on {``Measuring Experimental Error in
                 Microprocessor Simulation''}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "30",
  number =       "1",
  pages =        "2--4",
  month =        mar,
  year =         "2002",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:36 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chang:2002:ATI,
  author =       {Fu-Chi Chang and Chia-Jiu Wang},
  title =        {Architectural tradeoff in implementing
                 {RSA} processors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {1},
  pages =        {5--11},
  month =        mar,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:36 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Uht:2002:DEE,
  author =       {Augustus K. Uht},
  title =        {Disjoint {Eager Execution}: what it is
                 \slash what it is not},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {1},
  pages =        {12--14},
  month =        mar,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:36 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:2002:INa,
  author =       {Mark Thorson},
  title =        {{Internet} Nuggets},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {1},
  pages =        {15--21},
  month =        mar,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:36 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Hartstein:2002:OPD,
  author =       {A. Hartstein and Thomas R. Puzak},
  title =        {The optimum pipeline depth for a microprocessor},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {2},
  pages =        {7--13},
  month =        may,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Hrishikesh:2002:OLD,
  author =       {M. S. Hrishikesh and Doug Burger and Norman P. Jouppi
                 and Stephen W. Keckler and Keith I. Farkas and
                 Premkishore Shivakumar},
  title =        {The optimal logic depth per pipeline stage is 6 to 8
                 {FO4} inverter delays},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {2},
  pages =        {14--24},
  month =        may,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Sprangle:2002:IPP,
  author =       {Eric Sprangle and Doug Carmean},
  title =        {Increasing processor performance by implementing
                 deeper pipelines},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {2},
  pages =        {25--34},
  month =        may,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Ernst:2002:EDS,
  author =       {Dan Ernst and Todd Austin},
  title =        {Efficient dynamic scheduling through tag elimination},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {2},
  pages =        {37--46},
  month =        may,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Fields:2002:SMP,
  author =       {Brian Fields and Rastislav Bod{\'\i}k and Mark D.
                 Hill},
  title =        {{Slack}: maximizing performance under technological
                 constraints},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {2},
  pages =        {47--58},
  month =        may,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Lebeck:2002:LFI,
  author =       {Alvin R. Lebeck and Jinson Koppanalil and Tong Li and
                 Jaidev Patwardhan and Eric Rotenberg},
  title =        {A large, fast instruction window for tolerating cache
                 misses},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {2},
  pages =        {59--70},
  month =        may,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Kim:2002:ISM,
  author =       {Ho-Seop Kim and James E. Smith},
  title =        {An instruction set and microarchitecture for
                 instruction level distributed processing},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {2},
  pages =        {71--81},
  month =        may,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Vijaykumar:2002:TFR,
  author =       {T. N. Vijaykumar and Irith Pomeranz and Karl Cheng},
  title =        {Transient-fault recovery using simultaneous
                 multithreading},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {2},
  pages =        {87--98},
  month =        may,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Mukherjee:2002:DDE,
  author =       {Shubhendu S. Mukherjee and Michael Kontz and Steven K.
                 Reinhardt},
  title =        {Detailed design and evaluation of redundant
                 multithreading alternatives},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {2},
  pages =        {99--110},
  month =        may,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Prvulovic:2002:RCE,
  author =       {Milos Prvulovic and Zheng Zhang and Josep Torrellas},
  title =        {{ReVive}: cost-effective architectural support for
                 rollback recovery in shared-memory multiprocessors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {2},
  pages =        {111--122},
  month =        may,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Sorin:2002:SIA,
  author =       {Daniel J. Sorin and Milo M. K. Martin and Mark D. Hill
                 and David A. Wood},
  title =        {{SafetyNet}: improving the availability of shared
                 memory multiprocessors with global checkpoint\slash
                 recovery},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {2},
  pages =        {123--134},
  month =        may,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Heo:2002:DFG,
  author =       {Seongmoo Heo and Kenneth Barr and Mark Hampton and
                 Krste Asanovi{\'c}},
  title =        {Dynamic fine-grain leakage reduction using
                 leakage-biased bitlines},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {2},
  pages =        {137--147},
  month =        may,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Flautner:2002:DCS,
  author =       {Kriszti{\'a}n Flautner and Nam Sung Kim and Steve
                 Martin and David Blaauw and Trevor Mudge},
  title =        {Drowsy caches: simple techniques for reducing leakage
                 power},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {2},
  pages =        {148--157},
  month =        may,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Iyer:2002:PPE,
  author =       {Anoop Iyer and Diana Marculescu},
  title =        {Power and performance evaluation of globally
                 asynchronous locally synchronous processors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {2},
  pages =        {158--168},
  month =        may,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Solihin:2002:UUL,
  author =       {Yan Solihin and Jaejin Lee and Josep Torrellas},
  title =        {Using a user-level memory thread for correlation
                 prefetching},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {2},
  pages =        {171--182},
  month =        may,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Lewis:2002:AIM,
  author =       {Jarrod A. Lewis and Bryan Black and Mikko H. Lipasti},
  title =        {Avoiding initialization misses to the heap},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {2},
  pages =        {183--194},
  month =        may,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Kandiraju:2002:GDT,
  author =       {Gokul B. Kandiraju and Anand Sivasubramaniam},
  title =        {Going the distance for {TLB} prefetching: an
                 application-driven study},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {2},
  pages =        {195--206},
  month =        may,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Hu:2002:TMS,
  author =       {Zhigang Hu and Stefanos Kaxiras and Margaret
                 Martonosi},
  title =        {Timekeeping in the memory system: predicting and
                 optimizing memory behavior},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {2},
  pages =        {209--220},
  month =        may,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Kim:2002:IOD,
  author =       {Ilhyun Kim and Mikko H. Lipasti},
  title =        {Implementing optimizations at decode time},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {2},
  pages =        {221--232},
  month =        may,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Dhodapkar:2002:MMC,
  author =       {Ashutosh S. Dhodapkar and James E. Smith},
  title =        {Managing multi-configuration hardware via dynamic
                 working set analysis},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {2},
  pages =        {233--244},
  month =        may,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Buonadonna:2002:QPI,
  author =       {Philip Buonadonna and David Culler},
  title =        {Queue pair {IP}: a hybrid architecture for system area
                 networks},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {2},
  pages =        {247--256},
  month =        may,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Zhou:2002:EVC,
  author =       {Yuanyuan Zhou and Angelos Bilas and Suresh Jagannathan
                 and Cezary Dubnicki and James F. Philbin and Kai Li},
  title =        {Experiences with {VI} communication for database
                 storage},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {2},
  pages =        {257--268},
  month =        may,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Pajuelo:2002:SDV,
  author =       {Alex Pajuelo and Antonio Gonz{\'a}lez and Mateo
                 Valero},
  title =        {Speculative dynamic vectorization},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {2},
  pages =        {271--280},
  month =        may,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Espasa:2002:TVE,
  author =       {Roger Espasa and Federico Ardanaz and Joel Emer and
                 Stephen Felix and Julio Gago and Roger Gramunt and
                 Isaac Hernandez and Toni Juan and Geoff Lowney and
                 Matthew Mattina and Andr{\'e} Seznec},
  title =        {{Tarantula}: a vector extension to the {Alpha}
                 architecture},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {2},
  pages =        {281--292},
  month =        may,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Seznec:2002:DTA,
  author =       {Andr{\'e} Seznec and Stephen Felix and Venkata
                 Krishnan and Yiannakis Sazeides},
  title =        {Design tradeoffs for the {Alpha EV8} conditional
                 branch predictor},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {2},
  pages =        {295--306},
  month =        may,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Chappell:2002:DPB,
  author =       {Robert S. Chappell and Francis Tseng and Adi Yoaz and
                 Yale N. Patt},
  title =        {Difficult-path branch prediction using subordinate
                 microthreads},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {2},
  pages =        {307--317},
  month =        may,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Raasch:2002:SIQ,
  author =       {Steven E. Raasch and Nathan L. Binkert and Steven K.
                 Reinhardt},
  title =        {A scalable instruction queue design using dependence
                 chains},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {2},
  pages =        {318--329},
  month =        may,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Steele:2002:OHH,
  author =       {Ken Steele and Jason Waterman and Eugene Weinstein},
  title =        {The {Oxygen H21} handheld},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {3},
  pages =        {3--4},
  month =        jun,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:00 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Keen:2002:HSC,
  author =       {Diana Keen and Frederic T. Chong},
  title =        {Hardware-software co-design of embedded
                 sensor-actuator networks},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {3},
  pages =        {5--6},
  month =        jun,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:00 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Kondo:2002:SCC,
  author =       {Masaaki Kondo and Motonobu Fujita and Hiroshi
                 Nakamura},
  title =        {Software-controlled on-chip memory for
                 high-performance and low-power computing},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {3},
  pages =        {7--8},
  month =        jun,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:00 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Sahoo:2002:SHA,
  author =       {Ramendra K. Sahoo and Myung Bae and Jose Moreira},
  title =        {Semi-hierarchical approach for reliability,
                 availability, and serviceability of cellular systems},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {3},
  pages =        {9--10},
  month =        jun,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:00 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Eberle:2002:MDC,
  author =       {Hans Eberle},
  title =        {Monitoring and diagnosing computer systems by radio
                 communication},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {3},
  pages =        {11--12},
  month =        jun,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:00 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Thies:2002:CML,
  author =       {William Thies and Michal Karczmarek and Michael Gordon
                 and David Maze and Jeremy Wong and Henry Hoffmann and
                 Matthew Brown and Saman Amarasinghe},
  title =        {A common machine language for grid-based
                 architectures},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {3},
  pages =        {13--14},
  month =        jun,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:00 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Wang:2002:NAM,
  author =       {Frank Wang and Na Helian and Farhi Marir},
  title =        {A novel associative memory architecture for quick
                 matching},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {3},
  pages =        {15--16},
  month =        jun,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:00 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Parker:2002:CUL,
  author =       {Mike Parker},
  title =        {A case for user-level interrupts},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {3},
  pages =        {17--18},
  month =        jun,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:00 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Burtscher:2002:IIF,
  author =       {Martin Burtscher},
  title =        {An improved index function for {(D)FCM} predictors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {3},
  pages =        {19--24},
  month =        jun,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:00 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:2002:INb,
  author =       {Mark Thorson},
  title =        {{Internet} Nuggets},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {3},
  pages =        {25--26},
  month =        jun,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:00 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

%%% G{\'o}mez, Pi{\~n}uel, Prieto, and Tirado, Computer Architecture
%%% News 30(4) 4--10, September 2002.  The first author's surname
%%% carries an acute accent (Spanish orthography), not a grave one.
@Article{Gomez:2002:ASA,
  author =       "I. G{\'o}mez and L. Pi{\~n}uel and M. Prieto and F.
                 Tirado",
  title =        "Analysis of simulation-adapted {SPEC 2000}
                 benchmarks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "30",
  number =       "4",
  pages =        "4--10",
  month =        sep,
  year =         "2002",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:15 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:2002:INc,
  author =       {Mark Thorson},
  title =        {{Internet} Nuggets},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {4},
  pages =        {11--16},
  month =        sep,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:15 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Estrin:2002:KAS,
  author =       {Deborah Estrin},
  title =        {Keynote address: {Sensor} network research: emerging
                 challenges for architecture, systems, and languages},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {5},
  pages =        {1--4},
  month =        dec,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:23 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Rajwar:2002:TLF,
  author =       {Ravi Rajwar and James R. Goodman},
  title =        {Transactional lock-free execution of lock-based
                 programs},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {5},
  pages =        {5--17},
  month =        dec,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:23 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Martinez:2002:SSA,
  author =       {Jos{\'e} F. Mart{\'\i}nez and Josep Torrellas},
  title =        {Speculative synchronization: applying thread-level
                 speculation to explicitly parallel applications},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {5},
  pages =        {18--29},
  month =        dec,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:23 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Lepak:2002:TSS,
  author =       {Kevin M. Lepak and Mikko H. Lipasti},
  title =        {Temporally silent stores},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {5},
  pages =        {30--41},
  month =        dec,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:23 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Sherwood:2002:ACL,
  author =       {Sherwood, Timothy and Perelman, Erez and
                 Hamerly, Greg and Calder, Brad},
  title =        {Automatically characterizing large scale program
                 behavior},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {5},
  pages =        {45--57},
  month =        dec,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:23 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Ogata:2002:BFO,
  author =       {Ogata, Kazunori and Komatsu, Hideaki and
                 Nakatani, Toshio},
  title =        {Bytecode fetch optimization for a {Java} interpreter},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {5},
  pages =        {58--67},
  month =        dec,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:23 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Li:2002:UIO,
  author =       {Li, Tao and John, Lizy Kurian and
                 Sivasubramaniam, Anand and Vijaykrishnan, N. and
                 Rubio, Juan},
  title =        {Understanding and improving operating system effects
                 in control flow prediction},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {5},
  pages =        {68--80},
  month =        dec,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:23 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Levis:2002:MTV,
  author =       {Levis, Philip and Culler, David},
  title =        {{Mat{\'e}}: a tiny virtual machine for sensor
                 networks},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {5},
  pages =        {85--95},
  month =        dec,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:23 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

%%% NOTE(review): the fifth author's given name is written with a hyphen
%%% (``Li-Shiuan'') here --- presumably matching the byline on the paper;
%%% confirm against the printed article.
@Article{Juang:2002:EEC,
  author =       "Philo Juang and Hidekazu Oki and Yong Wang and
                 Margaret Martonosi and Li-Shiuan Peh and Daniel
                 Rubenstein",
  title =        "Energy-efficient computing for wildlife tracking:
                 design tradeoffs and early experiences with
                 {ZebraNet}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "30",
  number =       "5",
  pages =        "96--107",
  month =        dec,
  year =         "2002",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:23 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@article{Kirovski:2002:ETS,
  author =       {Kirovski, Darko and Drini{\'c}, Milenko and
                 Potkonjak, Miodrag},
  title =        {Enabling trusted software integrity},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {5},
  pages =        {108--120},
  month =        dec,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:23 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Zeng:2002:EME,
  author =       {Zeng, Heng and Ellis, Carla S. and
                 Lebeck, Alvin R. and Vahdat, Amin},
  title =        {{ECOSystem}: managing energy as a first class
                 operating system resource},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {5},
  pages =        {123--132},
  month =        dec,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:23 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Ashok:2002:CMC,
  author =       {Ashok, Raksit and Chheda, Saurabh and
                 Moritz, Csaba Andras},
  title =        {{Cool-Mem}: combining statically speculative memory
                 accessing with selective address translation for energy
                 efficiency},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {5},
  pages =        {133--143},
  month =        dec,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:23 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Sasanka:2002:JLG,
  author =       {Sasanka, Ruchira and Hughes, Christopher J. and
                 Adve, Sarita V.},
  title =        {Joint local and global hardware adaptations for
                 energy},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {5},
  pages =        {144--155},
  month =        dec,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:23 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Kim:2002:DEC,
  author =       {Kim, Dongkeun and Yeung, Donald},
  title =        {Design and evaluation of compiler algorithms for
                 pre-execution},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {5},
  pages =        {159--170},
  month =        dec,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:23 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Zhai:2002:COS,
  author =       {Zhai, Antonia and Colohan, Christopher B. and
                 Steffan, J. Gregory and Mowry, Todd C.},
  title =        {Compiler optimization of scalar value communication
                 between speculative threads},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {5},
  pages =        {171--183},
  month =        dec,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:23 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Oplinger:2002:ESR,
  author =       {Oplinger, Jeffrey and Lam, Monica S.},
  title =        {Enhancing software reliability with speculative
                 threads},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {5},
  pages =        {184--196},
  month =        dec,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:23 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Butts:2002:DDI,
  author =       {Butts, J. Adam and Sohi, Guri},
  title =        {Dynamic dead-instruction detection and elimination},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {5},
  pages =        {199--210},
  month =        dec,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:23 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Kim:2002:ANU,
  author =       {Kim, Changkyu and Burger, Doug and
                 Keckler, Stephen W.},
  title =        {An adaptive, non-uniform cache structure for
                 wire-delay dominated on-chip caches},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {5},
  pages =        {211--222},
  month =        dec,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:23 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Mukherjee:2002:CSA,
  author =       {Mukherjee, Shubhendu S. and Silla, Federico and
                 Bannon, Peter and Emer, Joel and Lang, Steve and
                 Webb, David},
  title =        {A comparative study of arbitration algorithms for the
                 {Alpha 21364} pipelined router},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {5},
  pages =        {223--234},
  month =        dec,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:23 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

%%% The authors are given in ``Last, First'' form on purpose: BibTeX
%%% splits name tokens at hyphens, so in ``Hyong-youb Kim'' the
%%% lowercase token ``youb'' would be taken as a von part of the
%%% surname instead of as part of the given name.
@Article{Kim:2002:IWS,
  author =       "Kim, Hyong-youb and Pai, Vijay S. and Rixner, Scott",
  title =        "Increasing {Web} server throughput with network
                 interface data caching",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "30",
  number =       "5",
  pages =        "239--250",
  month =        dec,
  year =         "2002",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:23 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@article{Kohler:2002:PLO,
  author =       {Kohler, Eddie and Morris, Robert and Chen, Benjie},
  title =        {Programming language optimizations for modular router
                 configurations},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {5},
  pages =        {251--263},
  month =        dec,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:23 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Sivathanu:2002:ERA,
  author =       {Sivathanu, Muthian and Arpaci-Dusseau, Andrea C. and
                 Arpaci-Dusseau, Remzi H.},
  title =        {Evolving {RPC} for active storage},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {5},
  pages =        {264--276},
  month =        dec,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:23 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Cooksey:2002:SCD,
  author =       {Cooksey, Robert and Jourdan, Stephan and
                 Grunwald, Dirk},
  title =        {A stateless, content-directed data prefetching
                 mechanism},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {5},
  pages =        {279--290},
  month =        dec,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:23 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Gordon:2002:SCC,
  author =       {Gordon, Michael I. and Thies, William and
                 Karczmarek, Michal and Lin, Jasper and
                 Meli, Ali S. and Lamb, Andrew A. and
                 Leger, Chris and Wong, Jeremy and
                 Hoffmann, Henry and Maze, David and
                 Amarasinghe, Saman},
  title =        {A stream compiler for communication-exposed
                 architectures},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {5},
  pages =        {291--303},
  month =        dec,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:23 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Witchel:2002:MMP,
  author =       {Witchel, Emmett and Cates, Josh and
                 Asanovi{\'c}, Krste},
  title =        {{Mondrian} memory protection},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {30},
  number =       {5},
  pages =        {304--316},
  month =        dec,
  year =         {2002},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:23 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Dennis:2003:FBM,
  author =       {Dennis, Jack B.},
  title =        {Fresh {Breeze}: a multiprocessor chip architecture
                 guided by modular programming principles},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {31},
  number =       {1},
  pages =        {7--15},
  month =        mar,
  year =         {2003},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:37 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Morano:2003:RHI,
  author =       {Morano, D. and Khalafi, A. and Kaeli, D. R. and
                 Uht, A. K.},
  title =        {Realizing high {IPC} through a scalable memory-latency
                 tolerant multipath microarchitecture},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {31},
  number =       {1},
  pages =        {16--25},
  month =        mar,
  year =         {2003},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:37 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Almasi:2003:DCD,
  author =       {Alm{\'a}si, George and
                 Ca{\c{s}}caval, C{\u{a}}lin and
                 Casta{\~n}os, Jos{\'e} G. and Denneau, Monty and
                 Lieber, Derek and Moreira, Jos{\'e} E. and
                 {Warren, Jr.}, Henry S.},
  title =        {Dissecting {Cyclops}: a detailed analysis of a
                 multithreaded architecture},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {31},
  number =       {1},
  pages =        {26--38},
  month =        mar,
  year =         {2003},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:37 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Zahran:2003:CMH,
  author =       {Zahran, Mohamed M.},
  title =        {On cache memory hierarchy for {Chip-Multiprocessor}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {31},
  number =       {1},
  pages =        {39--48},
  month =        mar,
  year =         {2003},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:37 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Grewal:2003:EAC,
  author =       {Gr{\'e}wal, Gary and Wilson, Tom and Morton, Andrew},
  title =        {An {EGA} approach to the compile-time assignment of
                 data to multiple memories in digital-signal
                 processors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {31},
  number =       {1},
  pages =        {49--59},
  month =        mar,
  year =         {2003},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:37 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Ramacher:2003:GVP,
  author =       {Ramacher, Ulrich and Br{\"u}s, Nico and
                 Hachmann, Ulrich and Harnisch, Jens and
                 Raab, Wolfgang and Techmer, Axel},
  title =        {{100 GOPS} vision processor for automotive
                 applications},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {31},
  number =       {1},
  pages =        {60--68},
  month =        mar,
  year =         {2003},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:37 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Pitsianis:2003:IVM,
  author =       {Pitsianis, Nikos P. and Pechanek, Gerald G.},
  title =        {Indirect {VLIW} memory allocation for the {ManArray}
                 multiprocessor {DSP}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {31},
  number =       {1},
  pages =        {69--74},
  month =        mar,
  year =         {2003},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:37 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Shimizu:2003:TLS,
  author =       {Shimizu, Naohiko and Takatori, Ken},
  title =        {A transparent {Linux} super page kernel for {Alpha},
                 {Sparc64} and {IA32}: reducing {TLB} misses of
                 applications},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {31},
  number =       {1},
  pages =        {75--84},
  month =        mar,
  year =         {2003},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:37 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Bechini:2003:FGD,
  author =       {Bechini, Alessio and Foglia, Pierfrancesco and
                 Prete, Cosimo Antonio},
  title =        {Fine-grain design space exploration for a cartographic
                 {SoC} multiprocessor},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {31},
  number =       {1},
  pages =        {85--92},
  month =        mar,
  year =         {2003},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:37 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Thorson:2003:INa,
  author =       {Thorson, Mark},
  title =        {{Internet} Nuggets},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {31},
  number =       {1},
  pages =        {93--96},
  month =        mar,
  year =         {2003},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:37 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Skadron:2003:TAM,
  author =       {Skadron, Kevin and Stan, Mircea R. and Huang, Wei and
                 Velusamy, Sivakumar and
                 Sankaranarayanan, Karthik and Tarjan, David},
  title =        {Temperature-aware microarchitecture},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {31},
  number =       {2},
  pages =        {2--13},
  month =        may,
  year =         {2003},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:51 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Magklis:2003:PBD,
  author =       {Magklis, Grigorios and Scott, Michael L. and
                 Semeraro, Greg and Albonesi, David H. and
                 Dropsho, Steven},
  title =        {Profile-based dynamic voltage and frequency scaling
                 for a multiple clock domain microprocessor},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {31},
  number =       {2},
  pages =        {14--27},
  month =        may,
  year =         {2003},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:51 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Kim:2003:HPA,
  author =       {Kim, Ilhyun and Lipasti, Mikko H.},
  title =        {Half-price architecture},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {31},
  number =       {2},
  pages =        {28--38},
  month =        may,
  year =         {2003},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:51 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Park:2003:IMP,
  author =       {Park, Il and Falsafi, Babak and Vijaykumar, T. N.},
  title =        {Implicitly-multithreaded processors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {31},
  number =       {2},
  pages =        {39--51},
  month =        may,
  year =         {2003},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:51 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Citron:2003:MPM,
  author =       {Citron, Daniel},
  title =        {{MisSPECulation}: partial and misleading use of {SPEC
                 CPU2000} in computer architecture conferences},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {31},
  number =       {2},
  pages =        {52--61},
  month =        may,
  year =         {2003},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:51 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Tseng:2003:BMR,
  author =       {Tseng, Jessica H. and Asanovi{\'c}, Krste},
  title =        {Banked multiported register files for high-frequency
                 superscalar microprocessors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {31},
  number =       {2},
  pages =        {62--71},
  month =        may,
  year =         {2003},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:51 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Powell:2003:PDM,
  author =       {Powell, Michael D. and Vijaykumar, T. N.},
  title =        {Pipeline damping: a microarchitectural technique to
                 reduce inductive noise in supply voltage},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {31},
  number =       {2},
  pages =        {72--83},
  month =        may,
  year =         {2003},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:51 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Wunderlich:2003:SAM,
  author =       {Wunderlich, Roland E. and Wenisch, Thomas F. and
                 Falsafi, Babak and Hoe, James C.},
  title =        {{SMARTS}: accelerating microarchitecture simulation
                 via rigorous statistical sampling},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {31},
  number =       {2},
  pages =        {84--97},
  month =        may,
  year =         {2003},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:51 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Gomaa:2003:TFR,
  author =       {Gomaa, Mohamed and Scarbrough, Chad and
                 Vijaykumar, T. N. and Pomeranz, Irith},
  title =        {Transient-fault recovery for chip multiprocessors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {31},
  number =       {2},
  pages =        {98--109},
  month =        may,
  year =         {2003},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:51 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Prvulovic:2003:RUT,
  author =       {Prvulovic, Milos and Torrellas, Josep},
  title =        {{ReEnact}: using thread-level speculation mechanisms
                 to debug data races in multithreaded codes},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {31},
  number =       {2},
  pages =        {110--121},
  month =        may,
  year =         {2003},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:51 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Xu:2003:FDR,
  author =       {Xu, Min and Bodik, Rastislav and Hill, Mark D.},
  title =        {A ``flight data recorder'' for enabling full-system
                 multiprocessor deterministic replay},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {31},
  number =       {2},
  pages =        {122--135},
  month =        may,
  year =         {2003},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:51 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@article{Zhang:2003:HCC,
  author =       {Zhang, Chuanjun and Vahid, Frank and Najjar, Walid},
  title =        {A highly configurable cache architecture for embedded
                 systems},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {31},
  number =       {2},
  pages =        {136--146},
  month =        may,
  year =         {2003},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:51 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Buyuktosunoglu:2003:EEC,
  author          = {Alper Buyuktosuno{\u{g}}lu and Tejas Karkhanis and
                     David H. Albonesi and Pradip Bose},
  title           = {Energy efficient co-adaptive instruction fetch and
                     issue},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {2},
  pages           = {147--156},
  month           = may,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:51 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Huang:2003:PAP,
  author          = {Michael C. Huang and Jose Renau and Josep Torrellas},
  title           = {Positional adaptation of processors: application
                     to energy reduction},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {2},
  pages           = {157--168},
  month           = may,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:51 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Gurumurthi:2003:DDS,
  author          = {Sudhanva Gurumurthi and Anand Sivasubramaniam and
                     Mahmut Kandemir and Hubertus Franke},
  title           = {{DRPM}: dynamic speed control for power management
                     in server class disks},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {2},
  pages           = {169--181},
  month           = may,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:51 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Martin:2003:TCD,
  author          = {Milo M. K. Martin and Mark D. Hill and David A. Wood},
  title           = {Token coherence: decoupling performance and
                     correctness},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {2},
  pages           = {182--193},
  month           = may,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:51 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Singh:2003:GLB,
  author          = {Arjun Singh and William J. Dally and Amit K. Gupta
                     and Brian Towles},
  title           = {{GOAL}: a load-balanced adaptive routing algorithm
                     for torus networks},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {2},
  pages           = {194--205},
  month           = may,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:51 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Martin:2003:UDS,
  author          = {Milo M. K. Martin and Pacia J. Harper and Daniel
                     J. Sorin and Mark D. Hill and David A. Wood},
  title           = {Using destination-set prediction to improve the
                     latency\slash bandwidth tradeoff in shared-memory
                     multiprocessors},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {2},
  pages           = {206--217},
  month           = may,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:51 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Cvetanovic:2003:PAA,
  author          = {Zarka Cvetanovic},
  title           = {Performance analysis of the {Alpha 21364}-based
                     {HP GS1280} multiprocessor},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {2},
  pages           = {218--229},
  month           = may,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:51 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Oberoi:2003:PFE,
  author          = {Paramjit S. Oberoi and Gurindar S. Sohi},
  title           = {Parallelism in the front-end},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {2},
  pages           = {230--240},
  month           = may,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:51 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Seznec:2003:EAP,
  author          = {Andr{\'e} Seznec and Antony Fraboulet},
  title           = {Effective ahead pipelining of instruction block
                     address generation},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {2},
  pages           = {241--252},
  month           = may,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:51 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Ernst:2003:CBF,
  author          = {Dan Ernst and Andrew Hamel and Todd Austin},
  title           = {{Cyclone}: a broadcast-free dynamic instruction
                     scheduler with selective replay},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {2},
  pages           = {253--263},
  month           = may,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:51 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Bhargava:2003:IDC,
  author          = {Ravi Bhargava and Lizy K. John},
  title           = {Improving dynamic cluster assignment for clustered
                     trace cache processors},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {2},
  pages           = {264--274},
  month           = may,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:51 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Balasubramonian:2003:DMC,
  author          = {Rajeev Balasubramonian and Sandhya Dwarkadas and
                     David H. Albonesi},
  title           = {Dynamically managing the communication-parallelism
                     trade-off in future clustered processors},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {2},
  pages           = {275--287},
  month           = may,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:51 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Sherwood:2003:PMA,
  author          = {Timothy Sherwood and George Varghese and Brad Calder},
  title           = {A pipelined memory architecture for high
                     throughput network processors},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {2},
  pages           = {288--299},
  month           = may,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:51 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Hasan:2003:EUM,
  author          = {Jahangir Hasan and Satish Chandra and T. N.
                     Vijaykumar},
  title           = {Efficient use of memory bandwidth to improve
                     network processor throughput},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {2},
  pages           = {300--313},
  month           = may,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:51 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Thomas:2003:IBP,
  author          = {Renju Thomas and Manoj Franklin and Chris
                     Wilkerson and Jared Stark},
  title           = {Improving branch prediction by dynamic
                     dataflow-based identification of correlated
                     branches from a large global history},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {2},
  pages           = {314--323},
  month           = may,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:51 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Zhou:2003:DGS,
  author          = {Huiyang Zhou and Jill Flanagan and Thomas M. Conte},
  title           = {Detecting global stride locality in value streams},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {2},
  pages           = {324--335},
  month           = may,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:51 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Sherwood:2003:PTP,
  author          = {Timothy Sherwood and Suleyman Sair and Brad Calder},
  title           = {Phase tracking and prediction},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {2},
  pages           = {336--349},
  month           = may,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:51 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Anantaraman:2003:VSA,
  author          = {Aravindh Anantaraman and Kiran Seth and Kaustubh
                     Patil and Eric Rotenberg and Frank Mueller},
  title           = {Virtual simple architecture {(VISA)}: exceeding
                     the complexity limit in safe real-time systems},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {2},
  pages           = {350--361},
  month           = may,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:51 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Corliss:2003:DPM,
  author          = {Marc L. Corliss and E. Christopher Lewis and Amir
                     Roth},
  title           = {{DISE}: a programmable macro engine for
                     customizing applications},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {2},
  pages           = {362--373},
  month           = may,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:51 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Oskin:2003:BQW,
  author          = {Mark Oskin and Frederic T. Chong and Isaac L.
                     Chuang and John Kubiatowicz},
  title           = {Building quantum wires: the long and the short of it},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {2},
  pages           = {374--387},
  month           = may,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:51 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Wang:2003:GRP,
  author          = {Zhenlin Wang and Doug Burger and Kathryn S.
                     McKinley and Steven K. Reinhardt and Charles C.
                     Weems},
  title           = {Guided region prefetching: a cooperative
                     hardware\slash software approach},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {2},
  pages           = {388--398},
  month           = may,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:51 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Kozyrakis:2003:OLC,
  author          = {Christos Kozyrakis and David Patterson},
  title           = {Overcoming the limitations of conventional vector
                     processors},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {2},
  pages           = {399--409},
  month           = may,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:51 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Suh:2003:PAP,
  author          = {Jinwoo Suh and Eun-Gyu Kim and Stephen P. Crago
                     and Lakshmi Srinivasan and Matthew C. French},
  title           = {A performance analysis of {PIM}, stream
                     processing, and tiled processing on
                     memory-intensive signal processing kernels},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {2},
  pages           = {410--421},
  month           = may,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:51 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Sankaralingam:2003:EIT,
  author          = {Karthikeyan Sankaralingam and Ramadass Nagarajan
                     and Haiming Liu and Changkyu Kim and Jaehyuk Huh
                     and Doug Burger and Stephen W. Keckler and Charles
                     R. Moore},
  title           = {Exploiting {ILP}, {TLP}, and {DLP} with the
                     polymorphous {TRIPS} architecture},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {2},
  pages           = {422--433},
  month           = may,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:51 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Chen:2003:JSD,
  author          = {Michael K. Chen and Kunle Olukotun},
  title           = {The {Jrpm} system for dynamically parallelizing
                     {Java} programs},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {2},
  pages           = {434--446},
  month           = may,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:51 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Fong:2003:CAA,
  author          = {Anthony S. Fong},
  title           = {A computer architecture with access control and
                     cache option tags on individual instruction
                     operands},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {3},
  pages           = {1--5},
  month           = jun,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:00 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Tan:2003:DAP,
  author          = {Edwin J. Tan and Wendi B. Heinzelman},
  title           = {{DSP} architectures: past, present and futures},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {3},
  pages           = {6--19},
  month           = jun,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:00 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Vintan:2003:ABP,
  author          = {Lucian N. Vintan and Marius Sbera and Ioan Z. Mihu
                     and Adrian Florea},
  title           = {An alternative to branch prediction: pre-computed
                     branches},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {3},
  pages           = {20--29},
  month           = jun,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:00 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Heinrich:2003:OWA,
  author          = {Mark Heinrich and Mainak Chaudhuri},
  title           = {Ocean warning: avoid drowning},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {3},
  pages           = {30--32},
  month           = jun,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:00 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Lafitte:2003:QMC,
  author          = {Jean-Louis Lafitte},
  title           = {Qualitatively matching computer architecture with
                     {Turing} machine},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {3},
  pages           = {33--41},
  month           = jun,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:00 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Koushiro:2003:TLV,
  author          = {Takenori Koushiro and Toshinori Sato and Itsujiro
                     Arita},
  title           = {A trace-level value predictor for {Contrail}
                     processors},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {3},
  pages           = {42--47},
  month           = jun,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:00 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:2003:INb,
  author          = {Mark Thorson},
  title           = {{Internet} Nuggets},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {3},
  pages           = {48--54},
  month           = jun,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:00 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Thorup:2003:CPM,
  author          = {Mikkel Thorup},
  title           = {Combinatorial power in multimedia processors},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {4},
  pages           = {5--11},
  month           = sep,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:15 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Hau:2003:SJA,
  author          = {Gary K. W. Hau and Anthony Fong and Mok Pak Lun},
  title           = {Support of {Java API} for the {jHISC} system},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {4},
  pages           = {12--17},
  month           = sep,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:15 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Lun:2003:MMO,
  author          = {Mok Pak Lun and Richard Li and Anthony Fong},
  title           = {Method manipulation in an object-oriented processor},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {4},
  pages           = {18--25},
  month           = sep,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:15 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:2003:INc,
  author          = {Mark Thorson},
  title           = {{Internet} Nuggets},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {4},
  pages           = {26--32},
  month           = sep,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:15 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Breen:2003:AAA,
  author          = {Kristopher C. Breen and Duncan G. Elliott},
  title           = {Aliasing and anti-aliasing in branch history table
                     prediction},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {5},
  pages           = {1--4},
  month           = dec,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:23 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Yu:2003:TBS,
  author          = {Ryan W. S. Yu and Gary K. W. Hau and Anthony S. Fong},
  title           = {Test bench for software development of
                     object-oriented processor},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {5},
  pages           = {5--9},
  month           = dec,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:23 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Lun:2003:OOP,
  author          = {Mok Pak Lun and Anthony Fong and Gary K. W. Hau},
  title           = {Object-oriented processor requirements with
                     instruction analysis of {Java} programs},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {5},
  pages           = {10--15},
  month           = dec,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:23 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:2003:INd,
  author          = {Mark Thorson},
  title           = {{Internet} Nuggets},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {31},
  number          = {5},
  pages           = {16--21},
  month           = dec,
  year            = {2003},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:23 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{John:2004:MFS,
  author          = {Lizy Kurian John},
  title           = {More on finding a single number to indicate
                     overall performance of a benchmark suite},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {1},
  pages           = {3--8},
  month           = mar,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:37 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:2004:INa,
  author          = {Mark Thorson},
  title           = {{Internet} Nuggets},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {1},
  pages           = {9--13},
  month           = mar,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:37 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Taylor:2004:ERM,
  author =       "Michael Bedford Taylor and Walter Lee and Jason Miller
                 and David Wentzlaff and Ian Bratt and Ben Greenwald and
                 Henry Hoffmann and Paul Johnson and Jason Kim and James
                 Psota and Arvind Saraf and Nathan Shnidman and Volker
                 Strumpen and Matt Frank and Saman Amarasinghe and Anant
                 Agarwal",
  title =        "Evaluation of the {Raw} Microprocessor: An
                 Exposed-Wire-Delay Architecture for {ILP} and
                 {Streams}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "2--2",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Anonymous:2004:GCC,
  author          = {Anonymous},
  title           = {General {Co-Chair}'s Message},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {2},
  pages           = {9--9},
  month           = mar,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Anonymous:2004:PCM,
  author          = {Anonymous},
  title           = {Program {Chair}'s Message},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {2},
  pages           = {10--10},
  month           = mar,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Anonymous:2004:C,
  author          = {Anonymous},
  title           = {Committees},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {2},
  pages           = {11--11},
  month           = mar,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Anonymous:2004:Ra,
  author          = {Anonymous},
  title           = {Reviewers},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {2},
  pages           = {13--13},
  month           = mar,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

%%% Only the processor name "Imagine" is brace-protected in the title; the
%%% generic words that follow are left to the bibliography style's casing.
@Article{Ahn:2004:EIS,
  author =       "Jung Ho Ahn and William J. Dally and Brucek Khailany
                 and Ujval J. Kapasi and Abhishek Das",
  title =        "Evaluating the {Imagine} Stream Architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "14--14",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sias:2004:FTI,
  author          = {John W. Sias and
                     Sain-zee Ueng and
                     Geoff A. Kent and
                     Ian M. Steiner and
                     Erik M. Nystrom and
                     Wen-mei W. Hwu},
  title           = {Field-testing {IMPACT EPIC} research results in {Itanium 2}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {2},
  pages           = {26--26},
  month           = mar,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Vijaykumar:2004:WDP,
  author          = {T. N. Vijaykumar and
                     Zeshan Chishti},
  title           = {Wire Delay is Not a Problem for {SMT} (In the Near Future)},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {2},
  pages           = {40--40},
  month           = mar,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Krashinsky:2004:VTA,
  author          = {Ronny Krashinsky and
                     Christopher Batten and
                     Mark Hampton and
                     Steve Gerding and
                     Brian Pharris and
                     Jared Casper and
                     Krste Asanovic},
  title           = {The Vector-Thread Architecture},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {2},
  pages           = {52--52},
  month           = mar,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Kumar:2004:SIH,
  author          = {Rakesh Kumar and
                     Dean M. Tullsen and
                     Parthasarathy Ranganathan and
                     Norman P. Jouppi and
                     Keith I. Farkas},
  title           = {Single-{ISA} Heterogeneous Multi-Core Architectures for
                     Multithreaded Workload Performance},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {2},
  pages           = {64--64},
  month           = mar,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Chou:2004:MOE,
  author          = {Yuan Chou and
                     Brian Fahs and
                     Santosh Abraham},
  title           = {Microarchitecture Optimizations for Exploiting
                     Memory-Level Parallelism},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {2},
  pages           = {76--76},
  month           = mar,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Cain:2004:MOV,
  author          = {Harold W. Cain and
                     Mikko H. Lipasti},
  title           = {Memory Ordering: a Value-Based Approach},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {2},
  pages           = {90--90},
  month           = mar,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Hammond:2004:TMC,
  author          = {Lance Hammond and
                     Vicky Wong and
                     Mike Chen and
                     Brian D. Carlstrom and
                     John D. Davis and
                     Ben Hertzberg and
                     Manohar K. Prabhu and
                     Honggo Wijaya and
                     Christos Kozyrakis and
                     Kunle Olukotun},
  title           = {Transactional Memory Coherence and Consistency},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {2},
  pages           = {102--102},
  month           = mar,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Hangal:2004:TPV,
  author          = {Sudheendra Hangal and
                     Durgam Vahia and
                     Chaiyasit Manovit and
                     Juin-Yeu Joseph Lu},
  title           = {{TSOtool}: a Program for Verifying Memory Systems Using
                     the Memory Consistency Model},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {2},
  pages           = {114--114},
  month           = mar,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

%%% Only the system name "SMTp" is brace-protected in the title; the
%%% remaining words are ordinary and are left to the bibliography style's
%%% casing rules rather than being frozen with braces.
@Article{Chaudhuri:2004:SAN,
  author =       "Mainak Chaudhuri and Mark Heinrich",
  title =        "{SMTp}: An Architecture for Next-generation
                 Scalable Multi-threading",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "124--124",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% The title contains no proper nouns or acronyms, so no words are
%%% brace-protected; casing is left to the bibliography style.
@Article{Hughes:2004:FAF,
  author =       "Christopher J. Hughes and Sarita V. Adve",
  title =        "A Formal Approach to Frequent Energy Adaptations
                 for Multimedia Applications",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "138--138",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Oliver:2004:SMC,
  author          = {John Oliver and
                     Ravishankar Rao and
                     Paul Sultana and
                     Jedidiah Crandall and
                     Erik Czernikowski and
                     Leslie W. {Jones IV} and
                     Diana Franklin and
                     Venkatesh Akella and
                     Frederic T. Chong},
  title           = {{Synchroscalar}: a Multiple Clock Domain, Power-Aware,
                     Tile-Based Embedded Processor},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {2},
  pages           = {150--150},
  month           = mar,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Rosner:2004:PAT,
  author          = {Roni Rosner and
                     Yoav Almog and
                     Micha Moffie and
                     Naftali Schwartz and
                     Avi Mendelson},
  title           = {Power Awareness through Selective Dynamically Optimized
                     Traces},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {2},
  pages           = {162--162},
  month           = mar,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Bairavasundaram:2004:XRN,
  author          = {Lakshmi N. Bairavasundaram and
                     Muthian Sivathanu and
                     Andrea C. Arpaci-Dusseau and
                     Remzi H. Arpaci-Dusseau},
  title           = {{X-RAY}: a Non-Invasive Exclusive Caching Mechanism for
                     {RAIDs}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {2},
  pages           = {176--176},
  month           = mar,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Mullins:2004:LLV,
  author          = {Robert Mullins and
                     Andrew West and
                     Simon Moore},
  title           = {Low-Latency Virtual-Channel Routers for On-Chip Networks},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {2},
  pages           = {188--188},
  month           = mar,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Puente:2004:ICR,
  author          = {V. Puente and
                     J. A. Gregorio and
                     F. Vallejo and
                     R. Beivide},
  title           = {{Immunet}: a Cheap and Robust Fault-Tolerant Packet
                     Routing Mechanism},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {2},
  pages           = {198--198},
  month           = mar,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Alameldeen:2004:ACC,
  author          = {Alaa R. Alameldeen and
                     David A. Wood},
  title           = {Adaptive Cache Compression for High-Performance
                     Processors},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {2},
  pages           = {212--212},
  month           = mar,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Zhou:2004:IEA,
  author          = {Pin Zhou and
                     Feng Qin and
                     Wei Liu and
                     Yuanyuan Zhou and
                     Josep Torrellas},
  title           = {{iWatcher}: Efficient Architectural Support for Software
                     Debugging},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {2},
  pages           = {224--224},
  month           = mar,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Yehia:2004:SDI,
  author          = {Sami Yehia and
                     Olivier Temam},
  title           = {From Sequences of Dependent Instructions to Functions: An
                     Approach for Improving Performance without {ILP} or
                     Speculation},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {2},
  pages           = {238--238},
  month           = mar,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Falcon:2004:PCH,
  author          = {Ayose Falcon and
                     Jared Stark and
                     Alex Ramirez and
                     Konrad Lai and
                     Mateo Valero},
  title           = {Prophet\slash Critic Hybrid Branch Prediction},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {2},
  pages           = {250--250},
  month           = mar,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Weaver:2004:TRS,
  author          = {Christopher Weaver and
                     Joel Emer and
                     Shubhendu S. Mukherjee and
                     Steven K. Reinhardt},
  title           = {Techniques to Reduce the Soft Error Rate of a
                     High-Performance Microprocessor},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {2},
  pages           = {264--264},
  month           = mar,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Srinivasan:2004:CLR,
  author          = {Jayanth Srinivasan and
                     Sarita V. Adve and
                     Pradip Bose and
                     Jude A. Rivers},
  title           = {The Case for Lifetime Reliability-Aware Microprocessors},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {2},
  pages           = {276--276},
  month           = mar,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Powell:2004:ERB,
  author          = {Michael D. Powell and
                     T. N. Vijaykumar},
  title           = {Exploiting Resonant Behavior to Reduce Inductive Noise},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {2},
  pages           = {288--288},
  month           = mar,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Butts:2004:UBR,
  author          = {J. Adam Butts and
                     Gurindar S. Sohi},
  title           = {Use-Based Register Caching with Decoupled Indexing},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {2},
  pages           = {302--302},
  month           = mar,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

%%% The first author is Rub{\'e}n Gonz{\'a}lez; the given-name slot must not
%%% repeat the surname.  Adri{\'a}n Cristal carries the same accent as in
%%% the Cristal:2004:CRC entry of this file, so both entries name one
%%% person identically.  The citation key stays ASCII-only.
@Article{Gonzalez:2004:CAI,
  author =       "Rub{\'e}n Gonz{\'a}lez and Adri{\'a}n Cristal and
                 Daniel Ortega and Alexander Veidenbaum and Mateo
                 Valero",
  title =        "A Content Aware Integer Register File Organization",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "314--314",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lipasti:2004:PRI,
  author          = {Mikko H. Lipasti and
                     Brian R. Mestan and
                     Erika Gunadi},
  title           = {Physical Register Inlining},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {2},
  pages           = {325--325},
  month           = mar,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Karkhanis:2004:FOS,
  author          = {Tejas S. Karkhanis and
                     James E. Smith},
  title           = {A First-Order Superscalar Processor Model},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {2},
  pages           = {338--338},
  month           = mar,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Eeckhout:2004:CFM,
  author          = {Lieven Eeckhout and
                     Robert H. {Bell Jr.} and
                     Bastiaan Stougie and
                     Koen {De Bosschere} and
                     Lizy K. John},
  title           = {Control Flow Modeling in Statistical Simulation for
                     Accurate and Efficient Processor Design Studies},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {2},
  pages           = {350--350},
  month           = mar,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Iyer:2004:ESI,
  author          = {Bharath Iyer and
                     Sadagopan Srinivasan and
                     Bruce Jacob},
  title           = {Extended Split-Issue: Enabling Flexibility in the Hardware
                     Implementation of {NUAL VLIW DSPs}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {2},
  pages           = {364--364},
  month           = mar,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Parashar:2004:CEA,
  author          = {Angshuman Parashar and
                     Sudhanva Gurumurthi and
                     Anand Sivasubramaniam},
  title           = {A Complexity-Effective Approach to {ALU} Bandwidth
                     Enhancement for Instruction-Level Temporal Redundancy},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {2},
  pages           = {376--376},
  month           = mar,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Anonymous:2004:AI,
  author          = {Anonymous},
  title           = {Author Index},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {2},
  pages           = {387--387},
  month           = mar,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:40:45 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Cristal:2004:CRC,
  author          = {Adri{\'a}n Cristal and
                     Jos{\'e} F. Mart{\'\i}nez and
                     Josep Llosa and
                     Mateo Valero},
  title           = {A case for resource-conscious out-of-order processors:
                     towards kilo-instruction in-flight processors},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {3},
  pages           = {3--10},
  month           = jun,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:01 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Kundu:2004:CSI,
  author          = {Partha Kundu and
                     Murali Annavaram and
                     Trung Diep and
                     John Shen},
  title           = {A case for shared instruction cache on chip
                     multiprocessors running {OLTP}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {3},
  pages           = {11--18},
  month           = jun,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:01 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

%%% "Waran Research Foundation" is an institution, not a person.  It is
%%% enclosed in braces so that BibTeX treats it as one indivisible name
%%% instead of splitting it into given names plus the surname "Foundation".
@Article{Venkateswaran:2004:MPN,
  author =       "N. Venkateswaran and {Waran Research Foundation} and
                 Aditya Krishnan and S. Niranjan Kumar and Arrvindh
                 Shriraman and Srinivas Sridharan",
  title =        "Memory in processor: a novel design paradigm for
                 supercomputing architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "3",
  pages =        "19--26",
  month =        jun,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:01 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Branovic:2004:WCE,
  author          = {I. Branovic and
                     R. Giorgi and
                     E. Martinelli},
  title           = {A workload characterization of elliptic curve cryptography
                     methods in embedded environments},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {3},
  pages           = {27--34},
  month           = jun,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:01 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Brifault:2004:DCM,
  author          = {K. Brifault and
                     H. P. Charles},
  title           = {Data cache management on {EPIC} architecture: optimizing
                     memory access for image processing},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {3},
  pages           = {35--42},
  month           = jun,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:01 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Shimizu:2004:JOL,
  author          = {Naohiko Shimizu and
                     Chiaki Kon},
  title           = {{Java} object look aside buffer for embedded applications},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {3},
  pages           = {43--49},
  month           = jun,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:01 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Sakanaka:2004:LER,
  author          = {Akihito Sakanaka and
                     Seiichirou Fujii and
                     Toshinori Sato},
  title           = {A leakage-energy-reduction technique for
                     highly-associative caches in embedded systems},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {3},
  pages           = {50--54},
  month           = jun,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:01 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Moch:2004:HSM,
  author          = {S. Moch and
                     M. Berekovi{\'c} and
                     H. J. Stolberg and
                     L. Friebe and
                     M. B. Kulaczewski and
                     A. Dehnhardt and
                     P. Pirsch},
  title           = {{HIBRID-SOC}: a multi-core architecture for image and
                     video applications},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {32},
  number          = {3},
  pages           = {55--61},
  month           = jun,
  year            = {2004},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri May 12 09:41:01 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

%%% The first author's surname is written with its accent, matching the
%%% spelling Berekovi{\'c} used for the same author in the Moch:2004:HSM
%%% entry of this file.  The citation key stays ASCII-only.
@Article{Berekovic:2004:SCS,
  author =       "Mladen Berekovi{\'c} and S{\"o}ren Moch and Peter
                 Pirsch",
  title =        "A scalable, clustered {SMT} processor for digital
                 signal processing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "3",
  pages =        "62--69",
  month =        jun,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:01 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bartolini:2004:PIS,
  author =       {Bartolini, S. and Prete, C. A.},
  title =        {A proposal for input-sensitivity analysis of
                 profile-driven optimizations on embedded applications},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {32},
  number =       {3},
  pages =        {70--77},
  month =        jun,
  year =         {2004},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:01 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:2004:INb,
  author =       {Thorson, Mark},
  title =        {{Internet} Nuggets},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {32},
  number =       {3},
  pages =        {78--83},
  month =        jun,
  year =         {2004},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:01 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Mashey:2004:WBM,
  author =       {Mashey, John R.},
  title =        {War of the benchmark means: time for a truce},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {32},
  number =       {4},
  pages =        {1--14},
  month =        sep,
  year =         {2004},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:16 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Lafitte:2004:YLL,
  author =       {Lafitte, Jean-Louis},
  title =        {40 years later \ldots{} a new engine to handle an
                 operating system infrastructure},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {32},
  number =       {4},
  pages =        {15--22},
  month =        sep,
  year =         {2004},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:16 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:2004:INc,
  author =       {Thorson, Mark},
  title =        {{Internet} Nuggets},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {32},
  number =       {4},
  pages =        {23--41},
  month =        sep,
  year =         {2004},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:16 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Hammond:2004:PTC,
  author =       {Hammond, Lance and Carlstrom, Brian D. and Wong, Vicky
                 and Hertzberg, Ben and Chen, Mike and Kozyrakis,
                 Christos and Olukotun, Kunle},
  title =        {Programming with transactional coherence and
                 consistency {(TCC)}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {32},
  number =       {5},
  pages =        {1--13},
  month =        dec,
  year =         {2004},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:24 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Budiu:2004:SC,
  author =       {Budiu, Mihai and Venkataramani, Girish and Chelcea,
                 Tiberiu and Goldstein, Seth Copen},
  title =        {Spatial computation},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {32},
  number =       {5},
  pages =        {14--26},
  month =        dec,
  year =         {2004},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:24 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Ekanayake:2004:ULP,
  author =       {Ekanayake, Virantha and {Kelly IV}, Clinton and
                 Manohar, Rajit},
  title =        {An ultra low-power processor for sensor networks},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {32},
  number =       {5},
  pages =        {27--36},
  month =        dec,
  year =         {2004},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:24 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Lumb:2004:DSD,
  author =       {Lumb, Christopher R. and Golding, Richard},
  title =        {{D-SPTF}: decentralized request distribution in
                 brick-based storage systems},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {32},
  number =       {5},
  pages =        {37--47},
  month =        dec,
  year =         {2004},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:24 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Saito:2004:FBD,
  author =       {Saito, Yasushi and Fr{\o}lund, Svend and Veitch,
                 Alistair and Merchant, Arif and Spence, Susan},
  title =        {{FAB}: building distributed enterprise disk arrays
                 from commodity components},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {32},
  number =       {5},
  pages =        {48--58},
  month =        dec,
  year =         {2004},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:24 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Denehy:2004:DSA,
  author =       {Denehy, Timothy E. and Bent, John and Popovici,
                 Florentina I. and Arpaci-Dusseau, Andrea C. and
                 Arpaci-Dusseau, Remzi H.},
  title =        {Deconstructing storage arrays},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {32},
  number =       {5},
  pages =        {59--71},
  month =        dec,
  year =         {2004},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:24 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Zhuang:2004:HIE,
  author =       {Zhuang, Xiaotong and Zhang, Tao and Pande, Santosh},
  title =        {{HIDE}: an infrastructure for efficiently protecting
                 information leakage on the address bus},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {32},
  number =       {5},
  pages =        {72--84},
  month =        dec,
  year =         {2004},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:24 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Suh:2004:SPE,
  author =       {Suh, G. Edward and Lee, Jae W. and Zhang, David and
                 Devadas, Srinivas},
  title =        {Secure program execution via dynamic information flow
                 tracking},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {32},
  number =       {5},
  pages =        {85--96},
  month =        dec,
  year =         {2004},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:24 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Huh:2004:CDM,
  author =       {Huh, Jaehyuk and Chang, Jichuan and Burger, Doug and
                 Sohi, Gurindar S.},
  title =        {Coherence decoupling: making use of incoherence},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {32},
  number =       {5},
  pages =        {97--106},
  month =        dec,
  year =         {2004},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:24 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Srinivasan:2004:CFP,
  author =       {Srinivasan, Srikanth T. and Rajwar, Ravi and Akkary,
                 Haitham and Gandhi, Amit and Upton, Mike},
  title =        {Continual flow pipelines},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {32},
  number =       {5},
  pages =        {107--119},
  month =        dec,
  year =         {2004},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:24 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Desikan:2004:SSR,
  author =       {Desikan, Rajagopalan and Sethumadhavan, Simha and
                 Burger, Doug and Keckler, Stephen W.},
  title =        {Scalable selective re-execution for {EDGE}
                 architectures},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {32},
  number =       {5},
  pages =        {120--132},
  month =        dec,
  year =         {2004},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:24 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Regehr:2004:HSA,
  author =       {Regehr, John and Reid, Alastair},
  title =        {{HOIST}: a system for automatically deriving static
                 analyzers for embedded systems},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {32},
  number =       {5},
  pages =        {133--143},
  month =        dec,
  year =         {2004},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:24 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Wang:2004:HTV,
  author =       {Wang, Perry H. and Collins, Jamison D. and Wang, Hong
                 and Kim, Dongkeun and Greene, Bill and Chan, Kai-Ming
                 and Yunus, Aamir B. and Sych, Terry and Moore, Stephen
                 F. and Shen, John P.},
  title =        {Helper threads via virtual multithreading on an
                 experimental {Itanium-2} processor-based platform},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {32},
  number =       {5},
  pages =        {144--155},
  month =        dec,
  year =         {2004},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:24 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Hauswirth:2004:LOM,
  author =       {Hauswirth, Matthias and Chilimbi, Trishul M.},
  title =        {Low-overhead memory leak detection using adaptive
                 statistical profiling},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {32},
  number =       {5},
  pages =        {156--164},
  month =        dec,
  year =         {2004},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:24 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Shen:2004:LPP,
  author =       {Shen, Xipeng and Zhong, Yutao and Ding, Chen},
  title =        {Locality phase prediction},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {32},
  number =       {5},
  pages =        {165--176},
  month =        dec,
  year =         {2004},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:24 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Zhou:2004:DTP,
  author =       {Zhou, Pin and Pandey, Vivek and Sundaresan, Jagadeesan
                 and Raghuraman, Anand and Zhou, Yuanyuan and Kumar,
                 Sanjeev},
  title =        {Dynamic tracking of page miss ratio curve for memory
                 management},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {32},
  number =       {5},
  pages =        {177--188},
  month =        dec,
  year =         {2004},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:24 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Rabbah:2004:COP,
  author =       {Rabbah, Rodric M. and Sandanagobalane, Hariharan and
                 Ekpanyapong, Mongkol and Wong, Weng-Fai},
  title =        {Compiler orchestrated prefetching via speculation and
                 predication},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {32},
  number =       {5},
  pages =        {189--198},
  month =        dec,
  year =         {2004},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:24 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Cher:2004:SPM,
  author =       {Cher, Chen-Yong and Hosking, Antony L. and Vijaykumar,
                 T. N.},
  title =        {Software prefetching for mark-sweep garbage
                 collection: hardware analysis and software redesign},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {32},
  number =       {5},
  pages =        {199--210},
  month =        dec,
  year =         {2004},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:24 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Lowell:2004:DVM,
  author =       {Lowell, David E. and Saito, Yasushi and Samberg,
                 Eileen J.},
  title =        {Devirtualizable virtual machines enabling general,
                 single-node, online maintenance},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {32},
  number =       {5},
  pages =        {211--223},
  month =        dec,
  year =         {2004},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:24 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Smolens:2004:FBS,
  author =       {Smolens, Jared C. and Gold, Brian T. and Kim, Jangwoo
                 and Falsafi, Babak and Hoe, James C. and Nowatzyk,
                 Andreas G.},
  title =        {Fingerprinting: bounding soft-error detection latency
                 and bandwidth},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {32},
  number =       {5},
  pages =        {224--234},
  month =        dec,
  year =         {2004},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:24 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Bronevetsky:2004:ALC,
  author =       {Bronevetsky, Greg and Marques, Daniel and Pingali,
                 Keshav and Szwed, Peter and Schulz, Martin},
  title =        {Application-level checkpointing for shared memory
                 programs},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {32},
  number =       {5},
  pages =        {235--247},
  month =        dec,
  year =         {2004},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:24 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Wu:2004:FOM,
  author =       {Wu, Qiang and Juang, Philo and Martonosi, Margaret and
                 Clark, Douglas W.},
  title =        {Formal online methods for voltage\slash frequency
                 control in multiple clock domain microprocessors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {32},
  number =       {5},
  pages =        {248--259},
  month =        dec,
  year =         {2004},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:24 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Gomaa:2004:HRL,
  author =       {Gomaa, Mohamed and Powell, Michael D. and Vijaykumar,
                 T. N.},
  title =        {Heat-and-run: leveraging {SMT} and {CMP} to manage
                 power density through the operating system},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {32},
  number =       {5},
  pages =        {260--270},
  month =        dec,
  year =         {2004},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:24 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Li:2004:PDE,
  author =       {Li, Xiaodong and Li, Zhenmin and David, Francis and
                 Zhou, Pin and Zhou, Yuanyuan and Adve, Sarita and
                 Kumar, Sanjeev},
  title =        {Performance directed energy management for main memory
                 and disks},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {32},
  number =       {5},
  pages =        {271--283},
  month =        dec,
  year =         {2004},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:41:24 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Chess:2005:SAC,
  author =       {Chess, David M.},
  title =        {Security in autonomic computing},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {33},
  number =       {1},
  pages =        {2--5},
  month =        mar,
  year =         {2005},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:37 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
  remark =       {Workshop on Architectural Support for Security and
                 Anti-Virus (WASSA)},
}

@Article{Shi:2005:TIA,
  author =       {Shi, Weidong and Lee, Hsien-Hsin S. and Lu, Chenghuai
                 and Ghosh, Mrinmoy},
  title =        {Towards the issues in architectural support for
                 protection of software execution},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {33},
  number =       {1},
  pages =        {6--15},
  month =        mar,
  year =         {2005},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:37 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
  remark =       {Workshop on Architectural Support for Security and
                 Anti-Virus (WASSA)},
}

@Article{McGregor:2005:PCK,
  author =       {McGregor, John P. and Lee, Ruby B.},
  title =        {Protecting cryptographic keys and computations via
                 virtual secure coprocessing},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {33},
  number =       {1},
  pages =        {16--26},
  month =        mar,
  year =         {2005},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:37 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
  remark =       {Workshop on Architectural Support for Security and
                 Anti-Virus (WASSA)},
}

@Article{Rogers:2005:MPH,
  author =       {Rogers, Brian and Solihin, Yan and Prvulovic, Milos},
  title =        {Memory predecryption: hiding the latency overhead of
                 memory encryption},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {33},
  number =       {1},
  pages =        {27--33},
  month =        mar,
  year =         {2005},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:37 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
  remark =       {Workshop on Architectural Support for Security and
                 Anti-Virus (WASSA)},
}

@Article{Holland:2005:ADK,
  author =       {Holland, David A. and Lim, Ada T. and Seltzer, Margo
                 I.},
  title =        {An architecture a day keeps the hacker away},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {33},
  number =       {1},
  pages =        {34--41},
  month =        mar,
  year =         {2005},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:37 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
  remark =       {Workshop on Architectural Support for Security and
                 Anti-Virus (WASSA)},
}

@Article{Sidiroglou:2005:HSS,
  author =       {Sidiroglou, Stelios and Locasto, Michael E. and
                 Keromytis, Angelos D.},
  title =        {Hardware support for self-healing software services},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {33},
  number =       {1},
  pages =        {42--47},
  month =        mar,
  year =         {2005},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:37 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
  remark =       {Workshop on Architectural Support for Security and
                 Anti-Virus (WASSA)},
}

@Article{Crandall:2005:SAM,
  author =       {Crandall, Jedidiah R. and Chong, Frederic T.},
  title =        {A security assessment of the {Minos} architecture},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {33},
  number =       {1},
  pages =        {48--57},
  month =        mar,
  year =         {2005},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:37 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
  remark =       {Workshop on Architectural Support for Security and
                 Anti-Virus (WASSA)},
}

@Article{Burnside:2005:CCP,
  author =       {Burnside, Matthew and Keromytis, Angelos D.},
  title =        {The case for crypto protocol awareness inside the {OS}
                 kernel},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {33},
  number =       {1},
  pages =        {58--64},
  month =        mar,
  year =         {2005},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:37 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
  remark =       {Workshop on Architectural Support for Security and
                 Anti-Virus (WASSA)},
}

@Article{Corliss:2005:UDP,
  author =       {Corliss, Marc L. and Lewis, E. Christopher and Roth,
                 Amir},
  title =        {Using {DISE} to protect return addresses from attack},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {33},
  number =       {1},
  pages =        {65--72},
  month =        mar,
  year =         {2005},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:37 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
  remark =       {Workshop on Architectural Support for Security and
                 Anti-Virus (WASSA)},
}

@Article{Ye:2005:RRA,
  author =       {Ye, Dong and Kaeli, David},
  title =        {A reliable return address stack: microarchitectural
                 features to defeat stack smashing},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {33},
  number =       {1},
  pages =        {73--80},
  month =        mar,
  year =         {2005},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:37 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
  remark =       {Workshop on Architectural Support for Security and
                 Anti-Virus (WASSA)},
}

@Article{Inoue:2005:EST,
  author =       {Inoue, Koji},
  title =        {Energy-security tradeoff in a secure cache
                 architecture against buffer overflow attacks},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {33},
  number =       {1},
  pages =        {81--89},
  month =        mar,
  year =         {2005},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:37 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
  remark =       {Workshop on Architectural Support for Security and
                 Anti-Virus (WASSA)},
}

@Article{Uluski:2005:CAW,
  author =       {Uluski, Derek and Moffie, Micha and Kaeli, David},
  title =        {Characterizing antivirus workload execution},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {33},
  number =       {1},
  pages =        {90--98},
  month =        mar,
  year =         {2005},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Fri May 12 09:40:37 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
  remark =       {Workshop on Architectural Support for Security and
                 Anti-Virus (WASSA)},
}

@Article{Aldwairi:2005:CSM,
  author =       "Monther Aldwairi and Thomas Conte and Paul Franzon",
  title =        "Configurable string matching hardware for speeding up
                 intrusion detection",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "1",
  pages =        "99--107",
  month =        mar,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:37 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "Workshop on Architectural Support for Security and
                 Anti-Virus (WASSA)",
}

@Article{Milenkovic:2005:UIB,
  author =       "Milena Milenkovi{\'c} and Aleksandar Milenkovi{\'c}
                 and Emil Jovanov",
  title =        "Using instruction block signatures to counter code
                 injection attacks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "1",
  pages =        "108--117",
  month =        mar,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:37 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "Workshop on Architectural Support for Security and
                 Anti-Virus (WASSA)",
}

@Article{Zhang:2005:ASP,
  author =       "Youtao Zhang and Jun Yang and Yongjing Lin and Lan
                 Gao",
  title =        "Architectural support for protecting user privacy on
                 trusted processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "1",
  pages =        "118--123",
  month =        mar,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:37 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "Workshop on Architectural Support for Security and
                 Anti-Virus (WASSA)",
}

@Article{Shirase:2005:AEC,
  author =       "Masaaki Shirase and Yasushi Hibino",
  title =        "An architecture for elliptic curve cryptography
                 computation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "1",
  pages =        "124--133",
  month =        mar,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:37 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "Workshop on Architectural Support for Security and
                 Anti-Virus (WASSA)",
}

@Article{Kgil:2005:CSS,
  author =       "Taeho Kgil and Laura Falk and Trevor Mudge",
  title =        "{ChipLock}: support for secure microarchitectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "1",
  pages =        "134--143",
  month =        mar,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:37 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "Workshop on Architectural Support for Security and
                 Anti-Virus (WASSA)",
}

@Article{Ekman:2005:DLC,
  author =       "Magnus Ekman and Fredrik Warg and Jim Nilsson",
  title =        "An in-depth look at computer performance growth",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "1",
  pages =        "144--147",
  month =        mar,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:37 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Venkateswaran:2005:FTB,
  author =       "N. Venkateswaran and S. Balaji and V. Sridhar",
  title =        "Fault tolerant bus architecture for deep submicron
                 based processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "1",
  pages =        "148--155",
  month =        mar,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:37 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:2005:INa,
  author =       "Mark Thorson",
  title =        "{Internet} Nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "1",
  pages =        "156--160",
  month =        mar,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:37 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lee:2005:APC,
  author =       "Ruby B. Lee and Peter C. S. Kwan and John P. McGregor
                 and Jeffrey Dwoskin and Zhenghong Wang",
  title =        "Architecture for Protecting Critical Secrets in
                 Microprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "2--13",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Anonymous:2005:GCM,
  author =       "Anonymous",
  title =        "{General Chair}'s Message",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "ix--ix",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Anonymous:2005:PCM,
  author =       "Anonymous",
  title =        "Program {Chair}'s Message",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "x--xv",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Shi:2005:HEC,
  author =       "Weidong Shi and Hsien-Hsin S. Lee and Mrinmoy Ghosh
                 and Chenghuai Lu and Alexandra Boldyreva",
  title =        "High Efficiency Counter Mode Security Architecture via
                 Prediction and Precomputation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "14--24",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Anonymous:2005:C,
  author =       "Anonymous",
  title =        "Committees",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "xvi--xvi",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Anonymous:2005:R,
  author =       "Anonymous",
  title =        "Reviewers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "xvii--xviii",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Suh:2005:DIA,
  author =       "G. Edward Suh and Charles W. O'Donnell and Ishan
                 Sachdev and Srinivas Devadas",
  title =        "Design and Implementation of the {AEGIS} Single-Chip
                 Secure Processor Using Physical Random Functions",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "25--36",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gurumurthi:2005:DDR,
  author =       "Sudhanva Gurumurthi and Anand Sivasubramaniam and
                 Vivek K. Natarajan",
  title =        "Disk Drive Roadmap from the Thermal Perspective: a
                 Case for Dynamic Thermal Management",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "38--49",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Huggahalli:2005:DCA,
  author =       "Ram Huggahalli and Ravi Iyer and Scott Tetrick",
  title =        "Direct Cache Access for High Bandwidth Network {I/O}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "50--59",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gunawi:2005:DCS,
  author =       "Haryadi S. Gunawi and Nitin Agrawal and Andrea C.
                 Arpaci-Dusseau and Remzi H. Arpaci-Dusseau and Jiri
                 Schindler",
  title =        "Deconstructing Commodity Storage Clusters",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "60--71",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ekman:2005:RMM,
  author =       "Magnus Ekman and Per Stenstr{\"o}m",
  title =        "A Robust Main-Memory Compression Scheme",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "74--85",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Fahs:2005:CO,
  author =       "Brian Fahs and Todd Rafacz and Sanjay J. Patel and
                 Steven S. Lumetta",
  title =        "Continuous Optimization",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "86--97",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Petric:2005:RRB,
  author =       "Vlad Petric and Tingting Sha and Amir Roth",
  title =        "{RENO}: a Rename-Based Instruction Optimizer",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "98--109",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tan:2005:HTS,
  author =       "Lin Tan and Timothy Sherwood",
  title =        "A High Throughput String Matching Architecture for
                 Intrusion Detection and Prevention",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "112--122",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Baboescu:2005:TBR,
  author =       "Florin Baboescu and Dean M. Tullsen and Grigore Rosu
                 and Sumeet Singh",
  title =        "A Tree Based Router Search Engine Architecture with
                 Single Port Memories",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "123--133",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kyo:2005:IMA,
  author =       "Shorin Kyo and Shin'ichiro Okazaki and Tamio Arai",
  title =        "An Integrated Memory Array Processor Architecture for
                 Embedded Image Recognition Systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "134--145",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Reis:2005:DEH,
  author =       "George A. Reis and Jonathan Chang and Neil
                 Vachharajani and Ram Rangan and David I. August and
                 Shubhendu S. Mukherjee",
  title =        "Design and Evaluation of Hybrid Fault-Detection
                 Systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "148--159",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Schuchman:2005:RMT,
  author =       "Ethan Schuchman and T. N. Vijaykumar",
  title =        "{Rescue}: a Microarchitecture for Testability and
                 Defect Tolerance",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "160--171",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gomaa:2005:OTF,
  author =       "Mohamed A. Gomaa and T. N. Vijaykumar",
  title =        "Opportunistic Transient-Fault Detection",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "172--183",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Balensiefer:2005:EFI,
  author =       "Steven Balensiefer and Lucas Kregor-Stickles and Mark
                 Oskin",
  title =        "An Evaluation Framework and Instruction Set
                 Architecture for Ion-Trap Based Quantum
                 Micro-Architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "186--196",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Nazhandali:2005:EOS,
  author =       "Leyla Nazhandali and Bo Zhai and Javin Olson and Anna
                 Reeves and Michael Minuth and Ryan Helfand and Sanjay
                 Pant and Todd Austin and David Blaauw",
  title =        "Energy Optimization of Subthreshold-Voltage Sensor
                 Network Processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "197--207",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hempstead:2005:ULP,
  author =       "Mark Hempstead and Nikhil Tripathi and Patrick Mauro
                 and Gu-Yeon Wei and David Brooks",
  title =        "An Ultra Low Power System Architecture for Sensor
                 Network Applications",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "208--219",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wenisch:2005:TSS,
  author =       "Thomas F. Wenisch and Stephen Somogyi and Nikolaos
                 Hardavellas and Jangwoo Kim and Anastassia Ailamaki and
                 Babak Falsafi",
  title =        "Temporal Streaming of Shared Memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "222--233",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Moshovos:2005:REC,
  author =       "Andreas Moshovos",
  title =        "{RegionScout}: Exploiting Coarse Grain Sharing in
                 Snoop-Based Coherence",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "234--245",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Cantin:2005:IMP,
  author =       "Jason F. Cantin and Mikko H. Lipasti and James E.
                 Smith",
  title =        "Improving Multiprocessor Performance with Coarse-Grain
                 Coherence Tracking",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "246--257",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hines:2005:IPE,
  author =       "Stephen Hines and Joshua Green and Gary Tyson and
                 David Whalley",
  title =        "Improving Program Efficiency by Packing Instructions
                 into Registers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "260--271",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Clark:2005:AFT,
  author =       "Nathan Clark and Jason Blome and Michael Chu and Scott
                 Mahlke and Stuart Biles and Krisztian Flautner",
  title =        "An Architecture Framework for Transparent Instruction
                 Set Customization in Embedded Processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "272--283",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Narayanasamy:2005:BCR,
  author =       "Satish Narayanasamy and Gilles Pokam and Brad Calder",
  title =        "{BugNet}: Continuously Recording Program Execution for
                 Deterministic Replay Debugging",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "284--295",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Annavaram:2005:MAL,
  author =       "Murali Annavaram and Ed Grochowski and John Shen",
  title =        "Mitigating {Amdahl's Law} through {EPI} Throttling",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "298--309",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "energy per instruction (EPI)",
}

@Article{Talpes:2005:ISP,
  author =       "Emil Talpes and Diana Marculescu",
  title =        "Increased Scalability and Power Efficiency by Using
                 Multiple Speed Pipelines",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "310--321",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Petric:2005:EEP,
  author =       "Vlad Petric and Amir Roth",
  title =        "Energy-Effectiveness of Pre-Execution and Energy-Aware
                 {P}-Thread Selection",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "322--333",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Zhang:2005:VRM,
  author =       "Michael Zhang and Krste Asanovi{\'c}",
  title =        "Victim Replication: Maximizing Capacity while Hiding
                 Wire Delay in Tiled Chip Multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "336--345",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Speight:2005:AMP,
  author =       "Evan Speight and Hazim Shafi and Lixin Zhang and Ram
                 Rajamony",
  title =        "Adaptive Mechanisms and Policies for Managing Cache
                 Hierarchies in Chip Multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "346--356",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chishti:2005:ORC,
  author =       "Zeshan Chishti and Michael D. Powell and T. N.
                 Vijaykumar",
  title =        "Optimizing Replication, Communication, and Capacity
                 Allocation in {CMPs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "357--368",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Mutlu:2005:TEP,
  author =       "Onur Mutlu and Hyesoon Kim and Yale N. Patt",
  title =        "Techniques for Efficient Processing in Runahead
                 Execution Engines",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "370--381",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Jimenez:2005:PLB,
  author =       "Daniel A. Jim{\'e}nez",
  title =        "Piecewise Linear Branch Prediction",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "382--393",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Seznec:2005:AGH,
  author =       "Andr{\'e} Seznec",
  title =        "Analysis of the {O-GEometric History Length} Branch
                 Predictor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "394--405",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kumar:2005:IMC,
  author =       "Rakesh Kumar and Victor Zyuban and Dean M. Tullsen",
  title =        "Interconnections in Multi-Core Architectures:
                 Understanding Mechanisms, Overheads and Scaling",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "408--419",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kim:2005:MHR,
  author =       "John Kim and William J. Dally and Brian Towles and
                 Amit K. Gupta",
  title =        "Microarchitecture of a High-Radix Router",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "420--431",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Seo:2005:NOW,
  author =       "Daeho Seo and Akif Ali and Won-Taek Lim and Nauman
                 Rafique and Mithuna Thottethodi",
  title =        "Near-Optimal Worst-Case Throughput Routing for
                 Two-Dimensional Mesh Networks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "432--443",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gandhi:2005:SLS,
  author =       "Amit Gandhi and Haitham Akkary and Ravi Rajwar and
                 Srikanth T. Srinivasan and Konrad Lai",
  title =        "Scalable Load and Store Processing in Latency Tolerant
                 Processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "446--457",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Roth:2005:SVW,
  author =       "Amir Roth",
  title =        "{Store Vulnerability Window (SVW)}: Re-Execution
                 Filtering for Enhanced Load Optimization",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "458--468",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Torres:2005:SBD,
  author =       "E. F. Torres and P. Ib{\'a}{\~n}ez and V. Vi{\~n}als
                 and J. M. Llaber{\'\i}a",
  title =        "Store Buffer Design in First-Level Multibanked Data
                 Caches",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "469--480",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Meixner:2005:DVS,
  author =       "Albert Meixner and Daniel J. Sorin",
  title =        "Dynamic Verification of Sequential Consistency",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "482--493",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Rajwar:2005:VTM,
  author =       "Ravi Rajwar and Maurice Herlihy and Konrad Lai",
  title =        "Virtualizing Transactional Memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "494--505",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Balakrishnan:2005:IPA,
  author =       "Saisanthosh Balakrishnan and Ravi Rajwar and Mike
                 Upton and Konrad Lai",
  title =        "The Impact of Performance Asymmetry in Emerging
                 Multicore Architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "506--517",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Srinivasan:2005:ESD,
  author =       "Jayanth Srinivasan and Sarita V. Adve and Pradip Bose
                 and Jude A. Rivers",
  title =        "Exploiting Structural Duplication for Lifetime
                 Reliability Enhancement",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "520--531",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Biswas:2005:CAV,
  author =       "Arijit Biswas and Paul Racunas and Razvan Cheveresan
                 and Joel Emer and Shubhendu S. Mukherjee and Ram
                 Rangan",
  title =        "Computing Architectural Vulnerability Factors for
                 Address-Based Structures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "532--543",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Qureshi:2005:VWC,
  author =       "Moinuddin K. Qureshi and David Thompson and Yale N.
                 Patt",
  title =        "The {V-Way Cache}: Demand Based Associativity via
                 Global Replacement",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "544--555",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Anonymous:2005:AI,
  author =       "Anonymous",
  title =        "Author Index",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "556--557",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bartolini:2005:GEI,
  author =       "S. Bartolini and P. Foglia and C. A. Prete",
  title =        "{Guests editors'} introduction",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "3",
  pages =        "1--2",
  month =        jun,
  year =         "2005",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1152922.1101870",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 12:06:44 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In this issue of ACM SigArch Newsletter, we present
                 eight papers from the MEDEA Workshop, held in
                 conjunction with the International Conference on
                 Parallel Architectures and Compilation Techniques
                 (PACT-2004) [1], [2].",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Fradj:2005:EAM,
  author =       "Hanene Ben Fradj and Asmaa el Ouardighi and C{\'e}cile
                 Belleudy and Michel Auguin",
  title =        "Energy aware memory architecture configuration",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "3",
  pages =        "3--9",
  month =        jun,
  year =         "2005",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1152922.1101871",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 12:06:44 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In the context of battery-driven embedded systems,
                 reducing energy while maintaining performance is one of
                 today's challenges. The on-chip memory count for a
                 great part of the whole system consumption, especially
                 for images and video processing applications that make
                 heavy use of large memory data size. In this paper, we
                 present new technique for efficiently exploiting
                 on-chip memory space (cache, scratchpad) for a specific
                 application to reduce the energy consumption without
                 loss of performance. We configure and compare the
                 impact of three different memory architectures on the
                 energy consumption. The first one is composed of main
                 memory with cache, in the second architecture we find a
                 main memory and scratchpad memory and in the last
                 architecture we combine both cache and scratchpad with
                 the main memory. We show the effectiveness of the last
                 architecture and a saving about 35\% in energy
                 consumption.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Suh:2005:DOC,
  author =       "Hyo-Joong Suh and Sung Woo Chung",
  title =        "{DRACO}: optimized {CC-NUMA} system with novel
                 dual-link interconnections to reduce the memory
                 latency",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "3",
  pages =        "10--16",
  month =        jun,
  year =         "2005",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1152922.1101872",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 12:06:44 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The performances of multiprocessor systems mainly rely
                 on the processor clock speed and the memory latency. As
                 the processors speed up rapidly, the memory latency
                 becomes a major performance bottleneck in
                 multiprocessor systems. In this paper, we propose a
                 dual-link interconnection topology and its effective
                 routing scheme to reduce the remote memory latency on
                 the interconnection network. It can be applied at a
                 same implementation cost as traditional bi-directional
                 ring systems. We compare the performance of the
                 proposed system to that of the traditional
                 bi-directional ring-based system and toroidal
                 mesh-based system. By simulations, it is shown that the
                 proposed system outperforms the traditional
                 bi-directional ring-based system by 42$ \sim $101\%
                 and excels the toroidal mesh-based system by
                 4$ \sim $14\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Yehia:2005:LSA,
  author =       "Sami Yehia and Jean-Fran{\c{c}}ois Collard and Olivier
                 Temam",
  title =        "Load squared: adding logic close to memory to reduce
                 the latency of indirect loads with high miss ratios",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "3",
  pages =        "17--24",
  month =        jun,
  year =         "2005",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1152922.1101873",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 12:06:44 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Indirect memory accesses, where a load is fed by
                 another load, are ubiquitous because of rich data
                 structures and sophisticated software conventions, such
                 as the use of linkage tables and position independent
                 code. Unfortunately, they can be costly: if both loads
                 miss, two round trips to memory are required even
                 though the role of the first load is often limited to
                 fetching the address of the second load. To reduce the
                 total latency of such indirect accesses, a new
                 instruction called load squared is introduced. A load
                 squared does two fetches, the first fetch reading the
                 target address of the second. (An offset is optionally
                 added to the result of the first fetch.) The load
                 squared operation is performed by memory-side logic
                 (typically, the memory controller if it isn't located
                 on the main processor chip). In this study, load
                 squared is not an architecturally visible instruction:
                 the micro-architecture transparently decides which
                 loads should be replaced by loads squared. We show that
                 performance is sometimes improved significantly, and
                 never degraded.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kobayashi:2005:LAC,
  author =       "Hiroaki Kobayashi and Isao Kotera and Hiroyuki
                 Takizawa",
  title =        "Locality analysis to control dynamically way-adaptable
                 caches",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "3",
  pages =        "25--32",
  month =        jun,
  year =         "2005",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1152922.1101874",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 12:06:44 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper presents a control mechanism for
                 dynamically way-adaptable caches. The mechanism uses
                 the local and global information about the locality of
                 reference during execution. As the local information,
                 the cache access pattern is evaluated based on the
                 statistics of the LRU (Least-Recently Used) states of
                 cache entries referenced. If the memory accesses are
                 concentrated on and near the most recently used
                 entries, the mechanism knows that the locality of
                 reference is very high and there is room to decrease
                 the number of ways activated to fit the current
                 locality. On the other hand, if the accesses are widely
                 distributed from the most recently used entries to the
                 least recently used ones, the mechanism understands
                 that more ways are needed to improve the performance as
                 long as the resources are available. In addition, to
                 examine the global behavior of the locality of
                 reference, an n-bit state machine like n-bit branch
                 predictors is introduced into the mechanism. The state
                 machine traces a sequence of cache resizing requests
                 and evaluates its stability across the execution time.
                 Therefore, the state machine helps the mechanism avoid
                 unstable actions for enabling/disabling cache ways when
                 the locality shows the highly irregular behavior. The
                 experimental results indicate that an n-bit asymmetric
                 state machine using the LRU status information works
                 well to appropriately control cache ways even in the
                 case of the benchmarks with highly-irregular access
                 behaviors in cache references.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Arakawa:2005:SXE,
  author =       "F. Arakawa and M. Ishikawa and Y. Kondo and T. Kamei
                 and M. Ozawa and O. Nishii and T. Hattori",
  title =        "{SH-X}: an embedded processor core for consumer
                 appliances",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "3",
  pages =        "33--40",
  month =        jun,
  year =         "2005",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1152922.1101875",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 12:06:44 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "A SuperH\TM{} embedded processor core SH-X implemented
                 in a 130-nm CMOS process running at 400 MHz achieved
                 720 MIPS and 2.8 GFLOPS at a power of 250 mW under
                 worst-case conditions. It has a dual-issue seven-stage
                 pipeline architecture, but reaches the 1.8 MIPS/MHz of
                 the previous five-stage processor. The on-chip memory
                 configuration is tuned for digital consumer appliances.
                 A new resume-standby mode enables a standby current of
                 less than 100$ \mu $A and a 3-ms recovery time. The
                 processor meets the requirements of a wide range of
                 applications, and is suitable for digital appliances
                 aimed at the consumer market, such as cellular phones,
                 digital still/video cameras, and car navigation
                 systems.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Naz:2005:IDC,
  author =       "Afrin Naz and Mehran Rezaei and Krishna Kavi and
                 Philip Sweany",
  title =        "Improving data cache performance with integrated use
                 of split caches, victim cache and stream buffers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "3",
  pages =        "41--48",
  month =        jun,
  year =         "2005",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1152922.1101876",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 12:06:44 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In our prior work we explored a cache organization
                 providing architectural support for distinguishing
                 between memory references that exhibit spatial and
                 temporal locality and mapping them to separate caches.
                 That work showed that using separate (data) caches for
                 indexed or stream data and scalar data items could lead
                 to substantial improvements in terms of cache misses.
                 In addition, such a separation allowed for the design
                 of caches that could be tailored to meet the properties
                 exhibited by different data items. In this paper, we
                 investigate the interaction between three established
                 methods: split cache, victim cache and stream buffer.
                 Since significant amounts of compulsory and conflict
                 misses are avoided, the size of each cache (i.e., array
                 and scalar), as well as the combined cache capacity can
                 be reduced. Our results show that on average 55\%
                 reduction in miss rates over the base configuration.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "array cache; memory access time; scalar cache; stream
                 buffer; victim cache",
}

@Article{Pajuelo:2005:SEH,
  author =       "Alex Pajuelo and Antonio Gonz{\'a}lez and Mateo
                 Valero",
  title =        "Speculative execution for hiding memory latency",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "3",
  pages =        "49--56",
  month =        jun,
  year =         "2005",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1152922.1101877",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 12:06:44 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "L2 misses are one of the main causes for stalling the
                 activity in current and future microprocessors. In this
                 paper we present a mechanism to speculatively execute
                 independent instructions of L2-miss loads, even if no
                 entry in the reorder buffer is available. The proposed
                 mechanism generates future instances of instructions
                 that are expected to be independent of the delinquent
                 load. When these dynamic instructions are later
                 fetched, they use the previously precomputed data and
                 directly go to the commit stage without executing. The
                 mechanism replicates strided loads found above the
                 L2-miss load, that produce the data for the target
                 independent instructions. Instructions following the
                 L2-miss load will check if their source operands have
                 been replicated. In this case, multiple speculative
                 instances of them will also be generated. This
                 mechanism is built on top of a superscalar processor
                 with an aggressive prefetch scheme. Compared to this
                 baseline, the mechanism obtains 21\% of performance
                 improvement.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Verdu:2005:ITA,
  author =       "Javier Verd{\'u} and Jorge Garc{\'\i}a and Mario
                 Nemirovsky and Mateo Valero",
  title =        "The impact of traffic aggregation on the memory
                 performance of networking applications",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "3",
  pages =        "57--62",
  month =        jun,
  year =         "2005",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1152922.1101878",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 12:06:44 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The trend of the networking processing is to increase
                 the intelligence of the routers (i.e. security
                 capacities). This means that there is an increment in
                 the workload generated per packet and new types of
                 applications are emerging, such as stateful programs.
                 On the other hand, Internet traffic continues to grow
                 vigorously. This fact involves an increment of the
                 traffic aggregation levels and overloads the processing
                 capacities of the routers. In this paper we show the
                 importance of traffic aggregation level on networking
                 application studies. We also classify the applications
                 according to the data management of the packet
                 processing. Hence, we present the different impacts on
                 the data cache performance depending on the application
                 category. Our results show that traffic aggregation
                 level may affect the cache performance depending on the
                 networking application category. Stateful applications
                 show a significant sensitivity to this impact.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Allu:2005:ERC,
  author =       "Bramha Allu and Wei Zhang",
  title =        "Exploiting the replication cache to improve
                 performance for multiple-issue microprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "3",
  pages =        "63--71",
  month =        jun,
  year =         "2005",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1101868.1101880",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 12:06:44 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Performance and reliability are both of great
                 importance for microprocessor design. Recently, the
                 replication cache has been proposed to enhance data
                 cache reliability against soft errors. The replication
                 cache is a small fully associative cache to store the
                 replica for every write to the L1 data cache. In
                 addition to enhance data reliability, this paper
                 proposes several cost-effective techniques to improve
                 performance of multiple-issue microprocessors by
                 exploiting the replication cache. The idea is to make
                 use of the replication cache to increase cache
                 bandwidth through dual load and to reduce the L1 data
                 cache miss rate through partial victim caching. Built
                 upon these two schemes, we also propose a hybrid
                 approach to combine the benefits of both dual load and
                 partial victim caching for improving performance
                 further. Our experimental results show that exploiting
                 a replication cache with only 8 entries can improve
                 performance by 13.0\% on average without compromising
                 the enhanced data integrity.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:2005:INb,
  author =       "Mark Thorson",
  title =        "{Internet} nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "3",
  pages =        "72--74",
  month =        jun,
  year =         "2005",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1101868.1101882",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 12:06:44 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This column consists of selected traffic from the
                 comp.arch newsgroup, a forum for discussion of computer
                 architecture on the Internet---an international
                 computer network.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@article{Anonymous:2005:MW,
  author          = {Anonymous},
  title           = {{MEDEA 2004} workshop},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2005},
  month           = jun,
  volume          = {33},
  number          = {3},
  pages           = {??--??},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Tue Jun 17 12:06:44 MDT 2008},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Jouppi:2005:ISI,
  author          = {Norman P. Jouppi and Rakesh Kumar and Dean Tullsen},
  title           = {Introduction to the special issue on the {2005
                     Workshop on Design, Analysis, and Simulation of Chip
                     Multiprocessors (dasCMP'05)}},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2005},
  month           = nov,
  volume          = {33},
  number          = {4},
  pages           = {4--4},
  remark          = {Special issue: dasCMP'05.},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:08 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Laudon:2005:PWN,
  author          = {James Laudon},
  title           = {Performance\slash Watt: the new server focus},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2005},
  month           = nov,
  volume          = {33},
  number          = {4},
  pages           = {5--13},
  remark          = {Special issue: dasCMP'05.},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:08 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Davis:2005:RRA,
  author          = {John D. Davis and Cong Fu and James Laudon},
  title           = {The {RASE (Rapid, Accurate Simulation Environment)}
                     for chip multiprocessors},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2005},
  month           = nov,
  volume          = {33},
  number          = {4},
  pages           = {14--23},
  remark          = {Special issue: dasCMP'05.},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:08 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Hsu:2005:ECD,
  author          = {Lisa Hsu and Ravi Iyer and Srihari Makineni and Steve
                     Reinhardt and Donald Newell},
  title           = {Exploring the cache design space for large scale
                     {CMPs}},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2005},
  month           = nov,
  volume          = {33},
  number          = {4},
  pages           = {24--33},
  remark          = {Special issue: dasCMP'05.},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:08 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Davis:2005:CPS,
  author          = {John D. Davis and Stephen E. Richardson and Charis
                     Charitsis and Kunle Olukotun},
  title           = {A chip prototyping substrate: the flexible
                     architecture for simulation and testing {(FAST)}},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2005},
  month           = nov,
  volume          = {33},
  number          = {4},
  pages           = {34--43},
  remark          = {Special issue: dasCMP'05.},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:08 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Vachharajani:2005:CMP,
  author          = {Neil Vachharajani and Matthew Iyer and Chinmay Ashok
                     and Manish Vachharajani and David I. August and Daniel
                     Connors},
  title           = {Chip multi-processor scalability for single-threaded
                     applications},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2005},
  month           = nov,
  volume          = {33},
  number          = {4},
  pages           = {44--53},
  remark          = {Special issue: dasCMP'05.},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:08 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Chen:2005:HMP,
  author          = {Julia Chen and Philo Juang and Kevin Ko and Gilberto
                     Contreras and David Penry and Ram Rangan and Adam
                     Stoler and Li-Shiuan Peh and Margaret Martonosi},
  title           = {Hardware-modulated parallelism in chip
                     multiprocessors},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2005},
  month           = nov,
  volume          = {33},
  number          = {4},
  pages           = {54--63},
  remark          = {Special issue: dasCMP'05.},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:08 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Sampson:2005:FSC,
  author          = {Jack Sampson and Rub{\'e}n Gonz{\'a}lez and
                     Jean-Fran{\c{c}}ois Collard and Norman P. Jouppi and
                     Mike Schlansker},
  title           = {Fast synchronization for chip multiprocessors},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2005},
  month           = nov,
  volume          = {33},
  number          = {4},
  pages           = {64--69},
  remark          = {Special issue: dasCMP'05.},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:08 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Shayesteh:2005:DCS,
  author          = {Anahita Shayesteh and Glenn Reinman and Norman Jouppi
                     and Suleyman Sair and Tim Sherwood},
  title           = {Dynamically configurable shared {CMP} helper engines
                     for improved performance},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2005},
  month           = nov,
  volume          = {33},
  number          = {4},
  pages           = {70--79},
  remark          = {Special issue: dasCMP'05.},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:08 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Constantinou:2005:PIS,
  author          = {Theofanis Constantinou and Yiannakis Sazeides and
                     Pierre Michaud and Damien Fetis and Andre Seznec},
  title           = {Performance implications of single thread migration on
                     a chip multi-core},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2005},
  month           = nov,
  volume          = {33},
  number          = {4},
  pages           = {80--91},
  remark          = {Special issue: dasCMP'05.},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:08 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Martin:2005:MGE,
  author          = {Milo M. K. Martin and Daniel J. Sorin and Bradford M.
                     Beckmann and Michael R. Marty and Min Xu and Alaa R.
                     Alameldeen and Kevin E. Moore and Mark D. Hill and
                     David A. Wood},
  title           = {Multifacet's general execution-driven multiprocessor
                     simulator {(GEMS)} toolset},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2005},
  month           = nov,
  volume          = {33},
  number          = {4},
  pages           = {92--99},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:08 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Wang:2005:DMS,
  author          = {David Wang and Brinda Ganesh and Nuengwong Tuaycharoen
                     and Kathleen Baynes and Aamer Jaleel and Bruce Jacob},
  title           = {{DRAMsim}: a memory system simulator},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2005},
  month           = nov,
  volume          = {33},
  number          = {4},
  pages           = {100--107},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:08 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Rountree:2005:NH,
  author          = {Barry Rountree and Robert Springer and David K.
                     Lowenthal and Vincent W. Freeh},
  title           = {Notes from {HPPAC 2005}},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2005},
  month           = nov,
  volume          = {33},
  number          = {4},
  pages           = {108--112},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:08 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Wang:2005:GFB,
  author          = {H. C. Wang and C. K. Yuen},
  title           = {A general framework to build new {CPUs} by mapping
                     abstract machine code to instruction level parallel
                     execution hardware},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2005},
  month           = nov,
  volume          = {33},
  number          = {4},
  pages           = {113--120},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:08 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Sam:2005:IMS,
  author          = {Nana B. Sam and Martin Burtscher},
  title           = {Improving memory system performance with
                     energy-efficient value speculation},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2005},
  month           = nov,
  volume          = {33},
  number          = {4},
  pages           = {121--127},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:08 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Thorson:2005:INc,
  author =       "Mark Thorson",
  title =        "{Internet} nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "4",
  pages =        "128--133",
  month =        nov,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:08 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@article{Kaeli:2005:WIS,
  author          = {David Kaeli and Robert Cohn},
  title           = {{WBIA'05}: Introduction to the special issue},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2005},
  month           = dec,
  volume          = {33},
  number          = {5},
  pages           = {1--2},
  remark          = {WBIA'05},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:24 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Hu:2005:CCI,
  author          = {Chunling Hu and John McCabe and Daniel A. Jim{\'e}nez
                     and Ulrich Kremer},
  title           = {The {Camino Compiler} infrastructure},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2005},
  month           = dec,
  volume          = {33},
  number          = {5},
  pages           = {3--8},
  remark          = {WBIA'05},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:24 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Schulz:2005:SDB,
  author          = {Martin Schulz and Dong Ahn and Andrew Bernat and
                     Bronis R. de Supinski and Steven Y. Ko and Gregory Lee
                     and Barry Rountree},
  title           = {Scalable dynamic binary instrumentation for {Blue
                     Gene/L}},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2005},
  month           = dec,
  volume          = {33},
  number          = {5},
  pages           = {9--14},
  remark          = {WBIA'05},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:24 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Borin:2005:DBC,
  author          = {Edson Borin and Cheng Wang and Youfeng Wu and Guido
                     Araujo},
  title           = {Dynamic binary control-flow errors detection},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2005},
  month           = dec,
  volume          = {33},
  number          = {5},
  pages           = {15--20},
  remark          = {WBIA'05},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:24 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Moffie:2005:AAS,
  author          = {Micha Moffie and David Kaeli},
  title           = {{ASM}: application security monitor},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2005},
  month           = dec,
  volume          = {33},
  number          = {5},
  pages           = {21--26},
  remark          = {WBIA'05},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:24 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Zhao:2005:DMO,
  author          = {Qin Zhao and Rodric Rabbah and Weng-Fai Wong},
  title           = {Dynamic memory optimization using pool allocation and
                     prefetching},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2005},
  month           = dec,
  volume          = {33},
  number          = {5},
  pages           = {27--32},
  remark          = {WBIA'05},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:24 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Gao:2005:AAL,
  author          = {Xiaofeng Gao and Beth Simon and Allan Snavely},
  title           = {{ALITER}: an asynchronous lightweight instrumentation
                     tool for event recording},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2005},
  month           = dec,
  volume          = {33},
  number          = {5},
  pages           = {33--38},
  remark          = {WBIA'05},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:24 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{McCurdy:2005:UPM,
  author          = {Collin McCurdy and Charles Fischer},
  title           = {Using {Pin} as a memory reference generator for
                     multiprocessor simulation},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2005},
  month           = dec,
  volume          = {33},
  number          = {5},
  pages           = {39--44},
  remark          = {WBIA'05},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:24 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Pan:2005:CPE,
  author          = {Heidi Pan and Krste Asanovi{\'c} and Robert Cohn and
                     Chi-Keung Luk},
  title           = {Controlling program execution through binary
                     instrumentation},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2005},
  month           = dec,
  volume          = {33},
  number          = {5},
  pages           = {45--50},
  remark          = {WBIA'05},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:24 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Faroughi:2005:PPP,
  author          = {Nikrouz Faroughi},
  title           = {Profiling of parallel processing programs on shared
                     memory multiprocessors using {Simics}},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2005},
  month           = dec,
  volume          = {33},
  number          = {5},
  pages           = {51--56},
  remark          = {WBIA'05},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:24 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Kumar:2005:TDD,
  author          = {Naveen Kumar and Ramesh Peri},
  title           = {Transparent debugging of dynamically instrumented
                     programs},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2005},
  month           = dec,
  volume          = {33},
  number          = {5},
  pages           = {57--62},
  remark          = {WBIA'05},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:24 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Harris:2005:PAS,
  author          = {Laune C. Harris and Barton P. Miller},
  title           = {Practical analysis of stripped binary code},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2005},
  month           = dec,
  volume          = {33},
  number          = {5},
  pages           = {63--68},
  remark          = {WBIA'05},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:24 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Reddi:2005:PDC,
  author          = {Vijay Janapa Reddi and Dan Connors and Robert S.
                     Cohn},
  title           = {Persistence in dynamic code transformation systems},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2005},
  month           = dec,
  volume          = {33},
  number          = {5},
  pages           = {69--74},
  remark          = {WBIA'05},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:24 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Srinivasan:2005:MMC,
  author          = {Ram Srinivasan and Olaf Lubeck},
  title           = {{MonteSim}: a {Monte Carlo} performance model for
                     in-order microarchitectures},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2005},
  month           = dec,
  volume          = {33},
  number          = {5},
  pages           = {75--80},
  remark          = {WBIA'05},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:24 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Laurenzano:2005:LCT,
  author          = {Michael Laurenzano and Beth Simon and Allan Snavely
                     and Meghan Gunn},
  title           = {Low cost trace-driven memory simulation using
                     {SimPoint}},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2005},
  month           = dec,
  volume          = {33},
  number          = {5},
  pages           = {81--86},
  remark          = {WBIA'05},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 12 09:41:24 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Thorson:2005:INd,
  author =       "Mark Thorson",
  title =        "{Internet} nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "5",
  pages =        "87--93",
  month =        dec,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:24 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@article{Bartolini:2006:MPD,
  author          = {S. Bartolini and P. Foglia and R. Giorgi and C. A.
                     Prete},
  title           = {Memory performance: dealing with applications, systems
                     and architecture},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2006},
  volume          = {34},
  number          = {1},
  pages           = {1--2},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Mon Aug 21 15:00:05 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Friedman:2006:DCR,
  author          = {Scott Friedman and Praveen Krishnamurthy and Roger
                     Chamberlain and Ron K. Cytron and Jason E. Fritts},
  title           = {Dusty caches for reference counting garbage
                     collection},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2006},
  volume          = {34},
  number          = {1},
  pages           = {3--10},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Mon Aug 21 15:00:05 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Ramaswamy:2006:DTC,
  author          = {Subramanian Ramaswamy and Jaswanth Sreeram and
                     Sudhakar Yalamanchili and Krishna V. Palem},
  title           = {Data trace cache: an application specific cache
                     architecture},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2006},
  volume          = {34},
  number          = {1},
  pages           = {11--18},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Mon Aug 21 15:00:05 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Naz:2006:MCS,
  author          = {Afrin Naz and Krishna Kavi and Mehran Rezaei and
                     Wentong Li},
  title           = {Making a case for split data caches for embedded
                     applications},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2006},
  volume          = {34},
  number          = {1},
  pages           = {19--26},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Mon Aug 21 15:00:05 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Allu:2006:ERC,
  author          = {B. Allu and W. Zhang and M. Kandala},
  title           = {Exploiting the replication cache to improve cache read
                     bandwidth cost effectively},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2006},
  volume          = {34},
  number          = {1},
  pages           = {27--32},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Mon Aug 21 15:00:05 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Monchiero:2006:EST,
  author          = {Matteo Monchiero and Gianluca Palermo and Cristina
                     Silvano and Oreste Villa},
  title           = {An efficient synchronization technique for
                     multiprocessor systems on-chip},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2006},
  volume          = {34},
  number          = {1},
  pages           = {33--40},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Mon Aug 21 15:00:05 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Khunjush:2006:HMD,
  author          = {Farshad Khunjush and Nikitas J. Dimopoulos},
  title           = {Hiding message delivery and reducing memory access
                     latency by providing direct-to-cache transfer during
                     receive operations in a message passing environment},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2006},
  volume          = {34},
  number          = {1},
  pages           = {41--48},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Mon Aug 21 15:00:05 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Yue:2006:NCB,
  author          = {Yao Yue and Chuang Lin and Zhangxi Tan},
  title           = {{NPCryptBench}: a cryptographic benchmark suite for
                     network processors},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2006},
  volume          = {34},
  number          = {1},
  pages           = {49--56},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Mon Aug 21 15:00:05 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Lopez-Lagunas:2006:MBO,
  author          = {Abelardo L{\'o}pez-Lagunas and Sek M. Chai},
  title           = {Memory bandwidth optimization through stream
                     descriptors},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2006},
  volume          = {34},
  number          = {1},
  pages           = {57--64},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Mon Aug 21 15:00:05 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Chiyonobu:2006:EEI,
  author =       "Akihiro Chiyonobu and Toshinori Sato",
  title =        "Energy-efficient instruction scheduling utilizing
                 cache miss information",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "1",
  pages =        "65--70",
  month =        mar,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bardine:2006:AEV,
  author =       "Alessandro Bardine and Alessio Bechini and
                 Pierfrancesco Foglia and Cosimo Antonio Prete",
  title =        "Analysis of embedded video coder systems: a
                 system-level approach",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "1",
  pages =        "71--76",
  month =        mar,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gontmakher:2006:ILG,
  author =       "Alex Gontmakher and Assaf Schuster and Avi Mendelson",
  title =        "{Inthreads}: a low granularity parallelization model",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "1",
  pages =        "77--80",
  month =        mar,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:2006:INa,
  author =       "Mark Thorson",
  title =        "{Internet} nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "1",
  pages =        "81--86",
  month =        mar,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Patt:2006:CAR,
  author =       "Yale Patt",
  title =        "Computer Architecture Research and Future
                 Microprocessors: Where Do We Go from Here?",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "2--2",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kim:2006:GDE,
  author =       "Jongman Kim and Chrysostomos Nicopoulos and Dongkook
                 Park and Vijaykrishnan Narayanan and Mazin S. Yousif
                 and Chita R. Das",
  title =        "A Gracefully Degrading and Energy-Efficient Modular
                 Router Architecture for On-Chip Networks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "4--15",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Anonymous:2006:MGC,
  author =       "Anonymous",
  title =        "Message from the General Chair",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "10--10",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Anonymous:2006:MPC,
  author =       "Anonymous",
  title =        "Message from the Program Chair",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "11--11",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Anonymous:2006:R,
  author =       "Anonymous",
  title =        "Reviewers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "14--14",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Scott:2006:BHR,
  author =       "Steve Scott and Dennis Abts and John Kim and William
                 J. Dally",
  title =        "The {BlackWidow} High-Radix {Clos} Network",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "16--28",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Anonymous:2006:SG,
  author =       "Anonymous",
  title =        "{SIGARCH} Guidelines",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "17--17",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Arvind:2006:MMI,
  author =       "Arvind and Jan-Willem Maessen",
  title =        "Memory Model $=$ Instruction Reordering $+$ Store
                 Atomicity",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "29--40",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{vonPraun:2006:CMO,
  author =       "Christoph von Praun and Harold W. Cain and Jong-Deok
                 Choi and Kyung Dong Ryu",
  title =        "Conditional Memory Ordering",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "41--52",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{McDonald:2006:ASP,
  author =       "Austen McDonald and JaeWoong Chung and Brian D.
                 Carlstrom and Chi Cao Minh and Hassan Chafi and
                 Christos Kozyrakis and Kunle Olukotun",
  title =        "Architectural Semantics for Practical Transactional
                 Memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "53--65",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ranganathan:2006:ELP,
  author =       "Parthasarathy Ranganathan and Phil Leech and David
                 Irwin and Jeffrey Chase",
  title =        "Ensemble-level Power Management for Dense Blade
                 Servers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "66--77",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Donald:2006:TMT,
  author =       "James Donald and Margaret Martonosi",
  title =        "Techniques for Multicore Thermal Management:
                 Classification and New Exploration",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "78--88",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lin:2006:SLP,
  author =       "Yuan Lin and Hyunseok Lee and Mark Woh and Yoav Harel
                 and Scott Mahlke and Trevor Mudge and Chaitali
                 Chakrabarti and Krisztian Flautner",
  title =        "{SODA}: a Low-power Architecture For Software Radio",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "89--101",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Shi:2006:IFD,
  author =       "Weidong Shi and Hsien-Hsin S. Lee and Laura Falk and
                 Mrinmoy Ghosh",
  title =        "An Integrated Framework for Dependable and Revivable
                 Architectures Using Multicore Processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "102--113",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hankins:2006:MIS,
  author =       "Richard A. Hankins and Gautham N. Chinya and Jamison
                 D. Collins and Perry H. Wang and Ryan Rakvic and Hong
                 Wang and John P. Shen",
  title =        "Multiple Instruction Stream Processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "114--127",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Emma:2006:ESR,
  author =       "Philip Emma",
  title =        "The End of Scaling? Revolutions in Technology and
                 Microarchitecture as We Pass the 90 Nanometer Node",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "128--128",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Li:2006:DMC,
  author =       "Feihui Li and Chrysostomos Nicopoulos and Thomas
                 Richardson and Yuan Xie and Vijaykrishnan Narayanan and
                 Mahmut Kandemir",
  title =        "Design and Management of {$3$D} Chip Multiprocessors
                 Using Network-in-Memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "130--141",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Garg:2006:SMD,
  author =       "Alok Garg and M. Wasiur Rashid and Michael Huang",
  title =        "Slackened Memory Dependence Enforcement: Combining
                 Opportunistic Forwarding with Decoupled Verification",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "142--154",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Zhang:2006:BCR,
  author =       "Chuanjun Zhang",
  title =        "Balanced Cache: Reducing Conflict Misses of
                 Direct-Mapped Caches",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "155--166",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Qureshi:2006:CMA,
  author =       "Moinuddin K. Qureshi and Daniel N. Lynch and Onur
                 Mutlu and Yale N. Patt",
  title =        "A Case for {MLP}-Aware Cache Replacement",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "167--178",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Yan:2006:ICP,
  author =       "Chenyu Yan and Daniel Englender and Milos Prvulovic
                 and Brian Rogers and Yan Solihin",
  title =        "Improving Cost, Performance, and Security of Memory
                 Encryption and Authentication",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "179--190",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Brodie:2006:SAH,
  author =       "Benjamin C. Brodie and David E. Taylor and Ron K.
                 Cytron",
  title =        "A Scalable Architecture For High-Throughput
                 Regular-Expression Pattern Matching",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "191--202",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hasan:2006:CSE,
  author =       "Jahangir Hasan and Srihari Cadambi and Venkatta
                 Jakkula and Srimat Chakradhar",
  title =        "{Chisel}: a Storage-efficient, Collision-free
                 Hash-based Network Processing Architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "203--215",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Colohan:2006:TDB,
  author =       "Christopher B. Colohan and Anastassia Ailamaki and J.
                 Gregory Steffan and Todd C. Mowry",
  title =        "Tolerating Dependences Between Large Speculative
                 Threads Via Sub-Threads",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "216--226",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ceze:2006:BDS,
  author =       "Luis Ceze and James Tuck and Josep Torrellas and Calin
                 Cascaval",
  title =        "Bulk Disambiguation of Speculative Threads in
                 Multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "227--238",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Choi:2006:LBS,
  author =       "Seungryul Choi and Donald Yeung",
  title =        "Learning-Based {SMT} Processor Resource Distribution
                 via Hill-Climbing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "239--251",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Somogyi:2006:SMS,
  author =       "Stephen Somogyi and Thomas F. Wenisch and Anastassia
                 Ailamaki and Babak Falsafi and Andreas Moshovos",
  title =        "Spatial Memory Streaming",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "252--263",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chang:2006:CCC,
  author =       "Jichuan Chang and Gurindar S. Sohi",
  title =        "Cooperative Caching for Chip Multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "264--276",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hu:2006:RST,
  author =       "Shiliang Hu and James E. Smith",
  title =        "Reducing Startup Time in Co-Designed Virtual
                 Machines",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "277--288",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Yang:2006:TAD,
  author =       "Qing Yang and Weijun Xiao and Jin Ren",
  title =        "{TRAP}-Array: a Disk Array Architecture Providing
                 Timely Recovery to Any Point-in-time",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "289--301",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Balakrishnan:2006:PDD,
  author =       "Saisanthosh Balakrishnan and Gurindar S. Sohi",
  title =        "Program Demultiplexing: Data-flow based Speculative
                 Parallelization of Methods in Sequential Programs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "302--313",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Swanson:2006:APT,
  author =       "Steven Swanson and Andrew Putnam and Martha Mercaldi
                 and Ken Michelson and Andrew Petersen and Andrew
                 Schwerin and Mark Oskin and Susan J. Eggers",
  title =        "Area-Performance Trade-offs in Tiled Dataflow
                 Architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "314--326",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Strauss:2006:FSA,
  author =       "Karin Strauss and Xiaowei Shen and Josep Torrellas",
  title =        "Flexible Snooping: Adaptive Forwarding and Filtering
                 of Snoops in Embedded-Ring Multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "327--338",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Cheng:2006:IAC,
  author =       "Liqun Cheng and Naveen Muralimanohar and Karthik
                 Ramani and Rajeev Balasubramonian and John B. Carter",
  title =        "Interconnect-Aware Coherence Protocols for Chip
                 Multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "339--351",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Herrod:2006:FVT,
  author =       "Steve Herrod",
  title =        "The Future of Virtualization Technology",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "352--352",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{VanMeter:2006:DAQ,
  author =       "Rodney {Van Meter} and Kae Nemoto and W. J. Munro and
                 Kohei M. Itoh",
  title =        "Distributed Arithmetic on a Quantum Multicomputer",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "354--365",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Isailovic:2006:INS,
  author =       "Nemanja Isailovic and Yatish Patel and Mark Whitney
                 and John Kubiatowicz",
  title =        "Interconnection Networks for Scalable Quantum
                 Computers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "366--377",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thaker:2006:QMH,
  author =       "Darshan D. Thaker and Tzvetan S. Metodi and Andrew W.
                 Cross and Isaac L. Chuang and Frederic T. Chong",
  title =        "Quantum Memory Hierarchies: Efficient Designs to Match
                 Available Parallelism in Quantum Computing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "378--390",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Anonymous:2006:AI,
  author =       "Anonymous",
  title =        "Author Index",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "2",
  pages =        "391--391",
  month =        may,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 21 15:00:05 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Burtscher:2006:TTA,
  author =       {Burtscher, Martin},
  title =        {{TCgen 2.0}: a tool to automatically generate lossless
                 trace compressors},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {34},
  number =       {3},
  pages =        {1--8},
  month =        jun,
  year =         {2006},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Mon Sep 4 12:39:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Kumar:2006:LLB,
  author =       {Kumar, Abhas and Jain, Nisheet and Chaudhuri, Mainak},
  title =        {Long-latency branches: how much do they matter?},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {34},
  number =       {3},
  pages =        {9--15},
  month =        jun,
  year =         {2006},
  CODEN =        {CANED2},
  ISSN =         {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L =       {0163-5964},
  bibdate =      {Mon Sep 4 12:39:50 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:2006:INb,
  author          = {Mark Thorson},
  title           = {{Internet} nuggets},
  journal         = j-COMP-ARCH-NEWS,
  volume          = 34,
  number          = 3,
  pages           = {16--21},
  month           = jun,
  year            = 2006,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Mon Sep 4 12:39:50 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Henning:2006:SCB,
  author          = {John L. Henning},
  title           = {{SPEC CPU2006} benchmark descriptions},
  journal         = j-COMP-ARCH-NEWS,
  volume          = 34,
  number          = 4,
  pages           = {1--17},
  month           = sep,
  year            = 2006,
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/1186736.1186737},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Tue Jun 17 12:07:09 MDT 2008},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {On August 24, 2006, the Standard Performance
                     Evaluation Corporation (SPEC) announced CPU2006 [2],
                     which replaces CPU2000. The SPEC CPU benchmarks are
                     widely used in both industry and academia [3].},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

%%% Abstract: corrected "rather then another" to "rather than another".
@Article{Citron:2006:HGM,
  author =       "Daniel Citron and Adham Hurani and Alaa Gnadrey",
  title =        "The harmonic or geometric mean: does it really
                 matter?",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "4",
  pages =        "18--25",
  month =        sep,
  year =         "2006",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1186736.1186738",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 12:07:09 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "For several decades, computer scientists have been
                 arguing which mean is more appropriate for summarizing
                 computer performance: the harmonic or the geometric. We
                 show that many test cases used in the past to discredit
                 one mean or the other are either artificial or
                 incidental. Changing only one of the benchmarks may
                 result in totally different conclusions. In addition,
                 we conclude that for the SPEC CPU2000 benchmark suite,
                 the choice of averaging has very little influence on
                 the relative standing of different machines. Therefore,
                 the decision to purchase one system rather than another
                 should not be influenced by the type of averaging
                 used.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Poe:2006:BBS,
  author          = {James Poe and Tao Li},
  title           = {{BASS}: a benchmark suite for evaluating architectural
                     security systems},
  journal         = j-COMP-ARCH-NEWS,
  volume          = 34,
  number          = 4,
  pages           = {26--33},
  month           = sep,
  year            = 2006,
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/1186736.1186739},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Tue Jun 17 12:07:09 MDT 2008},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {As software vulnerabilities continue to be exposed on
                     a daily basis and the motivation of cunning adversaries
                     to compromise valuable computer assets grows, novel
                     methods must be developed to ensure security. Recently
                     there has been a growing interest within the computer
                     architecture research community in designing
                     architectural and hardware mechanisms to improve
                     security. Unfortunately, there is currently not a
                     representative set of benchmarks for evaluating the
                     security features of proposed hardware modifications.
                     The frequent result is that great effort is often spent
                     searching for vulnerable programs, and/or evaluations
                     suffer from a lack of diversity. To address this
                     problem, we developed BASS, a benchmark suite to
                     evaluate the security features of proposed
                     architectural solutions under various malicious attack
                     scenarios. BASS v 1.0 currently consists of seven
                     benchmarks chosen to cover a diverse range of
                     architectural attack characteristics. To facilitate the
                     use of these benchmarks in architectural security
                     research, we have developed both vulnerable programs
                     and scripts to automatically generate exploits
                     targeting those vulnerable programs across both 32-bit
                     x86 and 64-bit Alpha Linux platforms. The entire BASS
                     framework including documentation, source code, input
                     data sets, and precompiled binaries for the M5 full
                     system simulator is released under the Gnu GPL and can
                     be freely downloaded at
                     http://www.ideal.ece.ufl.edu/bass.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:2006:IN,
  author          = {Mark Thorson},
  title           = {{Internet} nuggets},
  journal         = j-COMP-ARCH-NEWS,
  volume          = 34,
  number          = 4,
  pages           = {34--37},
  month           = sep,
  year            = 2006,
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/1186736.1186741},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Tue Jun 17 12:07:09 MDT 2008},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {This column consists of selected traffic from the
                     comp.arch newsgroup, a forum for discussion of computer
                     architecture on the Internet---an international
                     computer network. As always, the opinions expressed in
                     this column are the personal views of the authors, and
                     do not necessarily represent the institutions to which
                     they are affiliated. Text which sets the context of a
                     message appears underlined or in italics; this is
                     usually text the author has quoted from earlier
                     messages. The code-like expressions below the authors'
                     names are their addresses on Internet.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Rosenblum:2006:IVC,
  author          = {Mendel Rosenblum},
  title           = {Impact of virtualization on computer architecture and
                     operating systems},
  journal         = j-COMP-ARCH-NEWS,
  volume          = 34,
  number          = 5,
  pages           = {1--1},
  month           = dec,
  year            = 2006,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Oct 27 06:18:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Adams:2006:CSH,
  author          = {Keith Adams and Ole Agesen},
  title           = {A comparison of software and hardware techniques for
                     {x86} virtualization},
  journal         = j-COMP-ARCH-NEWS,
  volume          = 34,
  number          = 5,
  pages           = {2--13},
  month           = dec,
  year            = 2006,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Oct 27 06:18:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Jones:2006:GMB,
  author          = {Stephen T. Jones and Andrea C. Arpaci-Dusseau and
                     Remzi H. Arpaci-Dusseau},
  title           = {{Geiger}: monitoring the buffer cache in a virtual
                     machine environment},
  journal         = j-COMP-ARCH-NEWS,
  volume          = 34,
  number          = 5,
  pages           = {14--24},
  month           = dec,
  year            = 2006,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Oct 27 06:18:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Crandall:2006:TSD,
  author          = {Jedidiah R. Crandall and Gary Wassermann and Daniela
                     A. S. de Oliveira and Zhendong Su and S. Felix Wu and
                     Frederic T. Chong},
  title           = {Temporal search: detecting hidden malware timebombs
                     with virtual machines},
  journal         = j-COMP-ARCH-NEWS,
  volume          = 34,
  number          = 5,
  pages           = {25--36},
  month           = dec,
  year            = 2006,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Oct 27 06:18:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Lu:2006:ADA,
  author          = {Shan Lu and Joseph Tucek and Feng Qin and Yuanyuan
                     Zhou},
  title           = {{AVIO}: detecting atomicity violations via access
                     interleaving invariants},
  journal         = j-COMP-ARCH-NEWS,
  volume          = 34,
  number          = 5,
  pages           = {37--48},
  month           = dec,
  year            = 2006,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Oct 27 06:18:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Xu:2006:RTR,
  author          = {Min Xu and Mark D. Hill and Rastislav Bodik},
  title           = {A regulated transitive reduction ({RTR}) for longer
                     memory race recording},
  journal         = j-COMP-ARCH-NEWS,
  volume          = 34,
  number          = 5,
  pages           = {49--60},
  month           = dec,
  year            = 2006,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Oct 27 06:18:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Bond:2006:BBE,
  author          = {Michael D. Bond and Kathryn S. McKinley},
  title           = {{Bell}: bit-encoding online memory leak detection},
  journal         = j-COMP-ARCH-NEWS,
  volume          = 34,
  number          = 5,
  pages           = {61--72},
  month           = dec,
  year            = 2006,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Oct 27 06:18:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Shyam:2006:ULC,
  author          = {Smitha Shyam and Kypros Constantinides and Sujay
                     Phadke and Valeria Bertacco and Todd Austin},
  title           = {Ultra low-cost defect protection for microprocessor
                     pipelines},
  journal         = j-COMP-ARCH-NEWS,
  volume          = 34,
  number          = 5,
  pages           = {73--82},
  month           = dec,
  year            = 2006,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Oct 27 06:18:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Reddy:2006:UPB,
  author          = {Vimal K. Reddy and Eric Rotenberg and Sailashri
                     Parthasarathy},
  title           = {Understanding prediction-based partial redundant
                     threading for low-overhead, high-coverage fault
                     tolerance},
  journal         = j-COMP-ARCH-NEWS,
  volume          = 34,
  number          = 5,
  pages           = {83--94},
  month           = dec,
  year            = 2006,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Oct 27 06:18:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Parashar:2006:SSB,
  author          = {Angshuman Parashar and Anand Sivasubramaniam and
                     Sudhanva Gurumurthi},
  title           = {{SlicK}: slice-based locality exploitation for
                     efficient redundant multithreading},
  journal         = j-COMP-ARCH-NEWS,
  volume          = 34,
  number          = 5,
  pages           = {95--105},
  month           = dec,
  year            = 2006,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Oct 27 06:18:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Heath:2006:MFT,
  author          = {Taliver Heath and Ana Paula Centeno and Pradeep George
                     and Luiz Ramos and Yogesh Jaluria},
  title           = {{Mercury} and {Freon}: temperature emulation and
                     management for server systems},
  journal         = j-COMP-ARCH-NEWS,
  volume          = 34,
  number          = 5,
  pages           = {106--116},
  month           = dec,
  year            = 2006,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Oct 27 06:18:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Kgil:2006:PUS,
  author          = {Taeho Kgil and Shaun D'Souza and Ali Saidi and Nathan
                     Binkert and Ronald Dreslinski and Trevor Mudge and
                     Steven Reinhardt and Krisztian Flautner},
  title           = {{PicoServer}: using {$3$D} stacking technology to
                     enable a compact energy efficient chip multiprocessor},
  journal         = j-COMP-ARCH-NEWS,
  volume          = 34,
  number          = 5,
  pages           = {117--128},
  month           = dec,
  year            = 2006,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Oct 27 06:18:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Coons:2006:SPS,
  author          = {Katherine E. Coons and Xia Chen and Doug Burger and
                     Kathryn S. McKinley and Sundeep K. Kushwaha},
  title           = {A spatial path scheduling algorithm for {EDGE}
                     architectures},
  journal         = j-COMP-ARCH-NEWS,
  volume          = 34,
  number          = 5,
  pages           = {129--140},
  month           = dec,
  year            = 2006,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Oct 27 06:18:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Mercaldi:2006:IST,
  author          = {Martha Mercaldi and Steven Swanson and Andrew Petersen
                     and Andrew Putnam and Andrew Schwerin and Mark Oskin
                     and Susan J. Eggers},
  title           = {Instruction scheduling for a tiled dataflow
                     architecture},
  journal         = j-COMP-ARCH-NEWS,
  volume          = 34,
  number          = 5,
  pages           = {141--150},
  month           = dec,
  year            = 2006,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Oct 27 06:18:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Gordon:2006:ECG,
  author          = {Michael I. Gordon and William Thies and Saman
                     Amarasinghe},
  title           = {Exploiting coarse-grained task, data, and pipeline
                     parallelism in stream programs},
  journal         = j-COMP-ARCH-NEWS,
  volume          = 34,
  number          = 5,
  pages           = {151--162},
  month           = dec,
  year            = 2006,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Oct 27 06:18:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Mishra:2006:TES,
  author          = {Mahim Mishra and Timothy J. Callahan and Tiberiu
                     Chelcea and Girish Venkataramani and Seth C. Goldstein
                     and Mihai Budiu},
  title           = {{Tartan}: evaluating spatial computation for whole
                     program execution},
  journal         = j-COMP-ARCH-NEWS,
  volume          = 34,
  number          = 5,
  pages           = {163--174},
  month           = dec,
  year            = 2006,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Oct 27 06:18:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Eyerman:2006:PCA,
  author          = {Stijn Eyerman and Lieven Eeckhout and Tejas Karkhanis
                     and James E. Smith},
  title           = {A performance counter architecture for computing
                     accurate {CPI} components},
  journal         = j-COMP-ARCH-NEWS,
  volume          = 34,
  number          = 5,
  pages           = {175--184},
  month           = dec,
  year            = 2006,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Oct 27 06:18:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Lee:2006:AER,
  author          = {Benjamin C. Lee and David M. Brooks},
  title           = {Accurate and efficient regression modeling for
                     microarchitectural performance and power prediction},
  journal         = j-COMP-ARCH-NEWS,
  volume          = 34,
  number          = 5,
  pages           = {185--194},
  month           = dec,
  year            = 2006,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Oct 27 06:18:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

%%% First author's surname begins with a dotted capital I, coded {\.I};
%%% the earlier {\"I} coding typesets an I with diaeresis instead.
@Article{Ipek:2006:EEA,
  author =       "Engin {\.I}pek and Sally A. McKee and Rich Caruana and
                 Bronis R. de Supinski and Martin Schulz",
  title =        "Efficiently exploring architectural design spaces via
                 predictive modeling",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "5",
  pages =        "195--206",
  month =        dec,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Oct 27 06:18:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kharbutli:2006:CEP,
  author          = {Mazen Kharbutli and Xiaowei Jiang and Yan Solihin and
                     Guru Venkataramani and Milos Prvulovic},
  title           = {Comprehensively and efficiently protecting the heap},
  journal         = j-COMP-ARCH-NEWS,
  volume          = 34,
  number          = 5,
  pages           = {207--218},
  month           = dec,
  year            = 2006,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Oct 27 06:18:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Chilimbi:2006:HIH,
  author          = {Trishul M. Chilimbi and Vinod Ganapathy},
  title           = {{HeapMD}: identifying heap-based bugs using anomaly
                     detection},
  journal         = j-COMP-ARCH-NEWS,
  volume          = 34,
  number          = 5,
  pages           = {219--228},
  month           = dec,
  year            = 2006,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Oct 27 06:18:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Narayanasamy:2006:RSM,
  author          = {Satish Narayanasamy and Cristiano Pereira and Brad
                     Calder},
  title           = {Recording shared memory dependencies using strata},
  journal         = j-COMP-ARCH-NEWS,
  volume          = 34,
  number          = 5,
  pages           = {229--240},
  month           = dec,
  year            = 2006,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Oct 27 06:18:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Patwardhan:2006:DTS,
  author          = {Jaidev P. Patwardhan and Vijeta Johri and Chris Dwyer
                     and Alvin R. Lebeck},
  title           = {A defect tolerant self-organizing nanoscale {SIMD}
                     architecture},
  journal         = j-COMP-ARCH-NEWS,
  volume          = 34,
  number          = 5,
  pages           = {241--251},
  month           = dec,
  year            = 2006,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Oct 27 06:18:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Schuchman:2006:PTA,
  author          = {Ethan Schuchman and T. N. Vijaykumar},
  title           = {A program transformation and architecture support for
                     quantum uncomputation},
  journal         = j-COMP-ARCH-NEWS,
  volume          = 34,
  number          = 5,
  pages           = {252--263},
  month           = dec,
  year            = 2006,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Oct 27 06:18:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Mysore:2006:IC,
  author          = {Shashidhar Mysore and Banit Agrawal and Navin
                     Srivastava and Sheng-Chih Lin and Kaustav Banerjee and
                     Tim Sherwood},
  title           = {Introspective {$3$D} chips},
  journal         = j-COMP-ARCH-NEWS,
  volume          = 34,
  number          = 5,
  pages           = {264--273},
  month           = dec,
  year            = 2006,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Oct 27 06:18:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Cantin:2006:SP,
  author          = {Jason F. Cantin and Mikko H. Lipasti and James E.
                     Smith},
  title           = {Stealth prefetching},
  journal         = j-COMP-ARCH-NEWS,
  volume          = 34,
  number          = 5,
  pages           = {274--282},
  month           = dec,
  year            = 2006,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Oct 27 06:18:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Chakraborty:2006:CSE,
  author          = {Koushik Chakraborty and Philip M. Wells and Gurindar
                     S. Sohi},
  title           = {Computation spreading: employing hardware migration to
                     specialize {CMP} cores on-the-fly},
  journal         = j-COMP-ARCH-NEWS,
  volume          = 34,
  number          = 5,
  pages           = {283--292},
  month           = dec,
  year            = 2006,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Oct 27 06:18:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Miller:2006:SBI,
  author          = {Jason E. Miller and Anant Agarwal},
  title           = {Software-based instruction caching for embedded
                     processors},
  journal         = j-COMP-ARCH-NEWS,
  volume          = 34,
  number          = 5,
  pages           = {293--302},
  month           = dec,
  year            = 2006,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Oct 27 06:18:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Li:2006:MEM,
  author          = {Xin Li and Marian Boldt and Reinhard von Hanxleden},
  title           = {Mapping {Esterel} onto a multi-threaded embedded
                     processor},
  journal         = j-COMP-ARCH-NEWS,
  volume          = 34,
  number          = 5,
  pages           = {303--314},
  month           = dec,
  year            = 2006,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Oct 27 06:18:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Binkert:2006:INI,
  author          = {Nathan L. Binkert and Ali G. Saidi and Steven K.
                     Reinhardt},
  title           = {Integrated network interfaces for high-bandwidth
                     {TCP\slash IP}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = 34,
  number          = 5,
  pages           = {315--324},
  month           = dec,
  year            = 2006,
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Oct 27 06:18:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

%%% CODEN/ISSN normalized to the Computer Architecture News values used
%%% by the neighbouring volume 34, number 5 entries; the entry previously
%%% carried CODEN OSRED8 and ISSN 0163-5980, which disagree with its
%%% journal, fjournal, and journal-URL fields.  ISSN-L added to match.
@Article{Tarditi:2006:AUD,
  author =       "David Tarditi and Sidd Puri and Jose Oglesby",
  title =        "{Accelerator}: using data parallelism to program
                 {GPUs} for general-purpose uses",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "5",
  pages =        "325--335",
  month =        dec,
  year =         "2006",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1168857.1168898",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Oct 27 06:18:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "GPUs are difficult to program for general-purpose
                 uses. Programmers can either learn graphics APIs and
                 convert their applications to use graphics pipeline
                 operations or they can use stream programming
                 abstractions of GPUs. We describe Accelerator, a system
                 that uses data parallelism to program GPUs for
                 general-purpose uses instead. Programmers use a
                 conventional imperative programming language and a
                 library that provides only high-level data-parallel
                 operations. No aspects of GPUs are exposed to
                 programmers. The library implementation compiles the
                 data-parallel operations on the fly to optimized GPU
                 pixel shader code and API calls. We describe the
                 compilation techniques used to do this. We evaluate the
                 effectiveness of using data parallelism to program GPUs
                 by providing results for a set of compute-intensive
                 benchmarks. We compare the performance of Accelerator
                 versions of the benchmarks against hand-written pixel
                 shaders. The speeds of the Accelerator versions are
                 typically within 50\% of the speeds of hand-written
                 pixel shader code. Some benchmarks significantly
                 outperform C versions on a CPU: they are up to 18 times
                 faster than C code running on a CPU.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% NOTE(review): no DOI or abstract field is recorded for this entry;
%%% both could be added from the publisher's record.
@Article{Damron:2006:HTM,
  author =       "Peter Damron and Alexandra Fedorova and Yossi Lev and
                 Victor Luchangco and Mark Moir and Daniel Nussbaum",
  title =        "Hybrid transactional memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "5",
  pages =        "336--346",
  month =        dec,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Oct 27 06:18:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Computer Architecture News 34(5), December 2006, pages 347--358.
%%% NOTE(review): no DOI or abstract field is recorded for this entry;
%%% both could be added from the publisher's record.
@Article{Chuang:2006:UPB,
  author =       "Weihaw Chuang and Satish Narayanasamy and Ganesh
                 Venkatesh and Jack Sampson and Michael {Van Biesbrouck}
                 and Gilles Pokam and Brad Calder and Osvaldo Colavin",
  title =        "Unbounded page-based transactional memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "5",
  pages =        "347--358",
  month =        dec,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Oct 27 06:18:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% The system name LogTM is brace-protected in the title so that
%%% sentence-casing styles keep its mixed capitalization.
%%% NOTE(review): no DOI or abstract field is recorded for this entry.
@Article{Moravan:2006:SNT,
  author =       "Michelle J. Moravan and Jayaram Bobba and Kevin E.
                 Moore and Luke Yen and Mark D. Hill and Ben Liblit and
                 Michael M. Swift and David A. Wood",
  title =        "Supporting nested transactional memory in {LogTM}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "5",
  pages =        "359--370",
  month =        dec,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Oct 27 06:18:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% `Cao Minh' is a two-word family name; it is braced so that BibTeX
%%% does not parse `Cao' as a given name when sorting or abbreviating.
%%% NOTE(review): no DOI or abstract field is recorded for this entry.
@Article{Chung:2006:TTM,
  author =       "JaeWoong Chung and Chi {Cao Minh} and Austen McDonald
                 and Travis Skare and Hassan Chafi and Brian D.
                 Carlstrom and Christos Kozyrakis and Kunle Olukotun",
  title =        "Tradeoffs in transactional memory virtualization",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "5",
  pages =        "371--381",
  month =        dec,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Oct 27 06:18:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Computer Architecture News 34(5), December 2006, pages 382--393.
%%% NOTE(review): no DOI or abstract field is recorded for this entry;
%%% both could be added from the publisher's record.
@Article{Kawahito:2006:NIR,
  author =       "Motohiro Kawahito and Hideaki Komatsu and Takao
                 Moriyama and Hiroshi Inoue and Toshio Nakatani",
  title =        "A new idiom recognition framework for exploiting
                 hardware-assist instructions",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "5",
  pages =        "382--393",
  month =        dec,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Oct 27 06:18:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Computer Architecture News 34(5), December 2006, pages 394--403.
%%% NOTE(review): no DOI or abstract field is recorded for this entry;
%%% both could be added from the publisher's record.
@Article{Bansal:2006:AGP,
  author =       "Sorav Bansal and Alex Aiken",
  title =        "Automatic generation of peephole superoptimizers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "5",
  pages =        "394--403",
  month =        dec,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Oct 27 06:18:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% The accented letter in the third author's surname is written as a
%%% BibTeX special character (brace group starting with a control
%%% sequence) so that it sorts and abbreviates as a single letter.
%%% NOTE(review): no DOI or abstract field is recorded for this entry.
@Article{Solar-Lezama:2006:CSF,
  author =       "Armando Solar-Lezama and Liviu Tancau and Rastislav
                 Bod{\'\i}k and Sanjit Seshia and Vijay Saraswat",
  title =        "Combinatorial sketching for finite programs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "5",
  pages =        "404--415",
  month =        dec,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Oct 27 06:18:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Computer Architecture News 34(5), December 2006, pages 416--425.
%%% The first author's surname is braced as {Da Silva} so that BibTeX
%%% keeps both words together as the family name.
%%% NOTE(review): no DOI or abstract field is recorded for this entry.
@Article{DaSilva:2006:PPA,
  author =       "Jeff {Da Silva} and J. Gregory Steffan",
  title =        "A probabilistic pointer analysis for speculative
                 optimizations",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "34",
  number =       "5",
  pages =        "416--425",
  month =        dec,
  year =         "2006",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Oct 27 06:18:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Editorial introduction to the dasCMP'06 workshop special issue,
%%% Computer Architecture News 35(1), March 2007.  The workshop name is
%%% brace-protected in the title to preserve its capitalization, and the
%%% remark field tags the entry with the workshop acronym.
@Article{Tullsen:2007:ISI,
  author =       "Dean Tullsen and Rakesh Kumar and Norman P. Jouppi",
  title =        "Introduction to the special issue on the {2006
                 Workshop on Design, Analysis, and Simulation of Chip
                 Multiprocessors: (dasCMP'06)}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "1",
  pages =        "2--2",
  month =        mar,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1241601.1241605",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:47:26 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Chip multiprocessor architectures are becoming
                 increasingly attractive as an option to provide high
                 instruction throughput while keeping power and
                 complexity under control. Such architectures have also
                 been shown to have scalability and productivity
                 advantages. Multi-core processors are fast becoming
                 mainstream.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "{DASCMP'06}",
}

%%% dasCMP'06 workshop paper (see the remark field), Computer
%%% Architecture News 35(1): the NXA proposal for software-controlled
%%% multithreading on chip multiprocessors.
@Article{Mahesri:2007:HSS,
  author =       "Aqeel Mahesri and Nicholas J. Wang and Sanjay J.
                 Patel",
  title =        "Hardware support for software controlled
                 multithreading",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "1",
  pages =        "3--12",
  month =        mar,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1241601.1241606",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:47:26 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Chip multi-processors have emerged as one of the most
                 effective uses of the huge number of transistors
                 available today and in the future, but questions remain
                 as to the best way to leverage CMPs to accelerate
                 single threaded applications. Previous approaches rely
                 on significant speculation to accomplish this goal. Our
                 proposal, NXA, is less speculative than previous
                 proposals, relying heavily on software to guarantee
                 thread correctness, though still allowing parallelism
                 in the presence of ambiguous dependences. It divides a
                 single thread of execution into multiple using the
                 master-worker paradigm where some set of master threads
                 execute code that spawns tasks for other, worker
                 threads. The master threads generally consist of
                 performance critical instructions that can prefetch
                 data, compute critical control decisions, or compute
                 performance critical dataflow slices. This prevents
                 non-critical instructions from competing with critical
                 instructions for processor resources, allowing the
                 critical thread (and thus the workload) to complete
                 faster. Empirical results from performance simulation
                 show a 20\% improvement in performance on a 2-way CMP
                 machine, demonstrating that software controlled
                 multithreading can indeed provide a benefit in the
                 presence of hardware support.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "{DASCMP'06}",
}

%%% dasCMP'06 workshop paper (see the remark field), Computer
%%% Architecture News 35(1): global stack simulation for projecting CMP
%%% cache performance.
%%% NOTE(review): the abstract opens with `Efficient utilizing';
%%% presumably `Efficiently utilizing' was intended -- confirm against
%%% the published abstract before changing the quoted text.
@Article{Shi:2007:CCP,
  author =       "Xudong Shi and Feiqi Su and Jih-kwon Peir and Ye Xia
                 and Zhen Yang",
  title =        "{CMP} cache performance projection: accessibility vs.
                 capacity",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "1",
  pages =        "13--20",
  month =        mar,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1241601.1241607",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:47:26 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Efficient utilizing on-chip storage space on
                 Chip-Multiprocessors (CMPs) has become an important
                 research topic. Tradeoffs between data accessibility
                 and effective on-chip capacity have been studied
                 extensively. It requires costly simulations to
                 understand a wide-spectrum of the design space. In this
                 paper, we first develop an abstract model for
                 understanding the performance impact with respect to
                 data replication. To overcome the lack of real-time
                 interactions among multiple cores in the abstract
                 model, we propose a global stack simulation strategy to
                 study the performance of a variety of cache
                 organizations on CMPs. The global stack logically
                 incorporates a shared stack and per-core private stacks
                 to collect shared/private reuse (stack) distances for
                 every memory reference in a single simulation pass.
                 With the collected reuse distances, performance in
                 terms of hits/misses and average memory access times
                 can be calculated for various cache organizations. We
                 verify the stack results against individual
                 execution-driven simulations that consider realistic
                 cache parameters and delays using a set of commercial
                 multithreaded workloads. The results show that stack
                 simulations can accurately model the performance of
                 various cache organizations. The single-pass stack
                 simulation results demonstrate that the effectiveness
                 of various techniques for optimizing the CMP on-chip
                 storage is closely related to the working sets of the
                 workloads as well as to the total cache sizes.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "CMP caches; data replication; performance modeling and
                 projection; stack simulation",
  remark =       "{DASCMP'06}",
}

%%% dasCMP'06 workshop paper (see the remark field), Computer
%%% Architecture News 35(1): QoS-aware management of shared CMP
%%% resources, with case studies on the last-level cache.
@Article{Guo:2007:CQC,
  author =       "Fei Guo and Hari Kannan and Li Zhao and Ramesh
                 Illikkal and Ravi Iyer and Don Newell and Yan Solihin
                 and Christos Kozyrakis",
  title =        "From chaos to {QoS}: case studies in {CMP} resource
                 management",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "1",
  pages =        "21--30",
  month =        mar,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1241601.1241608",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:47:26 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "As more and more cores are enabled on the die of
                 future CMP platforms, we expect that several diverse
                 workloads will run simultaneously on the platform. A
                 key example of this trend is the growth of
                 virtualization usage models. When multiple virtual
                 machines or applications or threads run simultaneously,
                 the quality of service (QoS) that the platform provides
                 to each individual thread is non-deterministic today.
                 This occurs because the simultaneously running threads
                 place very different demands on the shared resources
                 (cache space, memory bandwidth, etc) in the platform
                 and in most cases contend with each other. In this
                 paper, we first present case studies that show how this
                 results in non-deterministic performance. Unlike the
                 compute resources managed through scheduling, platform
                 resource allocation to individual threads cannot be
                 controlled today. In order to provide better
                 determinism and QoS, we then examine resource
                 management mechanisms and present QoS-aware
                 architectures and execution environments. The main
                 contribution of this paper is the architecture
                 feasibility analysis through prototypes that allow
                 experimentation with QoS-Aware execution environments
                 and architectural resources. We describe these QoS
                 prototypes and then present preliminary case studies of
                 multi-tasking and virtualization usage models sharing
                 one critical CMP resource (last-level cache). We then
                 demonstrate how proper management of the cache resource
                 can provide service differentiation and deterministic
                 performance behavior when running disparate workloads
                 in future CMP platforms.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "{DASCMP'06}",
}

%%% dasCMP'06 workshop paper (see the remark field), Computer
%%% Architecture News 35(1): per-core DVFS control for fairness,
%%% throughput, and energy efficiency on a CMP.  The \par followed by a
%%% blank line inside the abstract marks a paragraph break.
@Article{Kondo:2007:IFT,
  author =       "Masaaki Kondo and Hiroshi Sasaki and Hiroshi
                 Nakamura",
  title =        "Improving fairness, throughput and energy-efficiency
                 on a chip multiprocessor through {DVFS}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "1",
  pages =        "31--38",
  month =        mar,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1241601.1241609",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:47:26 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Recently, a single chip multiprocessor (CMP) is
                 becoming an attractive architecture for improving
                 throughput of program execution. In CMPs, multiple
                 processor cores share several hardware resources such
                 as cache memory and memory bus. Therefore, the resource
                 contention significantly degrades performance of each
                 thread and also loses fairness between threads.\par

                 In this paper, we propose a Dynamic Frequency and
                 Voltage Scaling (DVFS) algorithm for improving total
                 instruction throughput, fairness, and energy efficiency
                 of CMPs. The proposed technique periodically observes
                 the utilization ratio of shared resources and controls
                 the frequency and the voltage of each processor core
                 individually to balance the ratio between threads. We
                 evaluate our technique and the evaluation results show
                 that fairness between threads are greatly improved by
                 the technique. Moreover, the total instruction
                 throughput increases in many cases while reducing
                 energy consumption.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "{DASCMP'06}",
}

%%% dasCMP'06 workshop paper (see the remark field), Computer
%%% Architecture News 35(1): starvation-free commit arbitration for
%%% TCC-like transactional memory.  The umlaut in the second author's
%%% surname is written as a BibTeX special character so that it sorts
%%% and abbreviates as a single letter.
@Article{Waliullah:2007:SFC,
  author =       "M. M. Waliullah and Per Stenstr{\"o}m",
  title =        "Starvation-free commit arbitration policies for
                 transactional memory systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "1",
  pages =        "39--46",
  month =        mar,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1241601.1241610",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:47:26 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In transactional memory systems like TCC, unordered
                 transactions are committed on a first-come, first-serve
                 basis. If a transaction has read data that has been
                 modified by the next transaction to commit, it will
                 have to roll-back and a lot of computations can
                 potentially be wasted. Even worse, such simple commit
                 arbitration policies are prone to starvation; in fact,
                 the performance of Raytrace in SPLASH-2 suffered
                 significantly because of this.\par

                 This paper analyzes in detail the design issues for
                 commit arbitration policies and proposes novel policies
                 that reduce the amount of wasted computation due to
                 roll-back and, most importantly, avoid starvation. We
                 analyze in detail how to incorporate them in a TCC-like
                 transactional memory protocol. We find that our
                 proposed schemes have no impact on the common-case
                 performance. In addition, they add modest complexity to
                 the baseline protocol.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "{DASCMP'06}",
}

%%% dasCMP'06 workshop paper (see the remark field), Computer
%%% Architecture News 35(1): hardware transactional memory for an
%%% embedded, cache-coherent ARM-based multi-core cluster.  The title
%%% uses \slash so that `hardware/software' may break across lines.
@Article{Ferri:2007:HSF,
  author =       "Cesare Ferri and Tali Moreshet and R. Iris Bahar and
                 Luca Benini and Maurice Herlihy",
  title =        "A hardware\slash software framework for supporting
                 transactional memory in a {MPSoC} environment",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "1",
  pages =        "47--54",
  month =        mar,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1241601.1241611",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:47:26 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Manufacturers are focusing on
                 multiprocessor-system-on-a-chip (MPSoC) architectures
                 in order to provide increased concurrency, rather than
                 increased clock speed, for both large-scale as well as
                 embedded systems. Traditionally lock-based
                 synchronization is provided to support concurrency;
                 however, managing locks can be very difficult and error
                 prone. In addition, the performance and power cost of
                 lock-based synchronization can be high. Transactional
                 memories have been extensively investigated as an
                 alternative to lock-based synchronization in
                 general-purpose systems. It has been shown that
                 transactional memory has advantages over locks in terms
                 of ease of programming, performance and energy
                 consumption. However, their applicability to embedded
                 multi-core platforms has not been explored yet. In this
                 paper, we demonstrate a complete hardware transactional
                 memory solution for an embedded multi-core
                 architecture, consisting of a cache-coherent ARM-based
                 cluster, similar to ARM's MPCore. Using cycle accurate
                 power and performance models for the transactional
                 memory hardware, we evaluate our architectural
                 framework over a set of different system and
                 application settings, and show that transactional
                 memory is a promising solution, even for
                 resource-constrained embedded multiprocessors.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "{DASCMP'06}",
}

%%% dasCMP'06 workshop paper (see the remark field), Computer
%%% Architecture News 35(1): a profile-based framework for extracting
%%% function-level parallelism from sequential programs.
%%% NOTE(review): the abstract contains `This graphs show'; presumably
%%% `These graphs show' was intended -- confirm against the published
%%% abstract before changing the quoted text.
@Article{Rul:2007:FLP,
  author =       "Sean Rul and Hans Vandierendonck and Koen {De
                 Bosschere}",
  title =        "Function level parallelism driven by data
                 dependencies",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "1",
  pages =        "55--62",
  month =        mar,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1241601.1241612",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:47:26 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "With the rise of Chip multiprocessors (CMPs), the
                 amount of parallel computing power will increase
                 significantly in the near future. However, most
                 programs are sequential in nature and have not been
                 explicitly parallelized, so they cannot exploit these
                 parallel resources. Automatic parallelization of
                 sequential, non-regular codes is very hard, as
                 illustrated by the lack of solutions after more than 30
                 years of research on the topic. The question remains if
                 there is parallelism in sequential programs that can be
                 detected automatically and if so, how much parallelism
                 there is.\par

                 In this paper, we propose a framework for extracting
                 potential parallelism from programs. Applying this
                 framework to sequential programs can teach us how much
                 parallelism is present in a program, but also tells us
                 what the most appropriate parallel construct for a
                 program is, e.g. a pipeline, master/slave work
                 distribution, etc.\par

                 Our framework is profile-based, implying that it is not
                 safe. It builds two new graph representations of the
                 profile-data: the interprocedural data flow graph and
                 the data sharing graph. This graphs show the data-flow
                 between functions and the data structures facilitating
                 this data-flow, respectively.\par

                 We apply our framework on the SPECcpu2000 bzip2
                 benchmark, achieving a speedup of 3.74 of the
                 compression part and a global speedup of 2.45 on a quad
                 processor system.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "{DASCMP'06}",
}

%%% Guest editor's introduction to the set of SPEC CPU2006 analysis
%%% articles in Computer Architecture News 35(1), March 2007.
@Article{Henning:2007:GEI,
  author =       "John L. Henning",
  title =        "{Guest editor}'s introduction",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "1",
  pages =        "63--64",
  month =        mar,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1241601.1241614",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:47:26 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "During the development of the new benchmark suite
                 CPU2006, SPEC analyzed benchmark candidates for various
                 technical attributes, including time profiles, language
                 standard compliance, I/O activity, system resource
                 usage, and many other attributes. Many people
                 contributed to the analysis, as shown in the credits at
                 www.spec.org/cpu2006/docs/credits.html. This issue of
                 Computer Architecture News presents a set of articles
                 flowing from that analysis effort.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Historical overview of the SPEC CPU benchmark suites, Computer
%%% Architecture News 35(1), March 2007, pages 65--68.  The suite name
%%% is brace-protected in the title to preserve its capitalization.
@Article{Henning:2007:SCS,
  author =       "John L. Henning",
  title =        "{SPEC CPU} suite growth: an historical perspective",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "1",
  pages =        "65--68",
  month =        mar,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1241601.1241615",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:47:26 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Since 1989, the SPEC CPU benchmarks have aspired to
                 ambitious goals: fair, portable, comparable tests using
                 the compute-intensive portion of real applications. It
                 may be difficult today to remember just how much of a
                 challenge these goals presented when SPEC was first
                 founded, or how much of a break they were from previous
                 industry practice.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% SPEC CPU2006 article in Computer Architecture News 35(1), March
%%% 2007: multivariate statistical selection of a representative
%%% benchmark subset.
@Article{Phansalkar:2007:SSC,
  author =       "Aashish Phansalkar and Ajay Joshi and Lizy K. John",
  title =        "Subsetting the {SPEC CPU2006} benchmark suite",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "1",
  pages =        "69--76",
  month =        mar,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1241601.1241616",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:47:26 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "On August 24, 2006, the Standard Performance
                 Evaluation Corporation (SPEC) announced CPU2006 -- the
                 next generation of industry-standardized CPU-intensive
                 benchmark suite. The SPEC CPU benchmark suite has
                 become the most frequently used suite for
                 simulation-based computer architecture research.
                 Detailed processor simulators take days to weeks to
                 simulate each of the SPEC CPU programs. In order to
                 reduce simulation to a tractable time, architects and
                 researchers often use only a subset of benchmarks from
                 the SPEC CPU suite to evaluate the potential of their
                 ideas. Prior research has demonstrated that statistical
                 techniques are most effective to find a representative
                 subset of benchmark programs from a benchmark suite.
                 The objective of this paper is to apply multivariate
                 statistical data analysis techniques for selecting a
                 representative subset of programs from the SPEC CPU2006
                 benchmark suite. We measure a set of performance
                 counter based characteristics for the SPEC CPU2006
                 programs across a large number of architectures and
                 apply multivariate statistical analysis techniques to
                 find a representative subset of benchmarks and
                 representative input sets wherever multiple input sets
                 are provided. The results from this paper will help
                 architects and researchers to find a smaller but
                 representative set of programs from the SPEC CPU2006
                 benchmark suite, when time or resource constraints
                 prohibit experimentation with the entire benchmark
                 suite.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% SPEC CPU2006 article in Computer Architecture News 35(1), March
%%% 2007: incorporating C++ benchmarks into the suite and the standard
%%% compliance issues met along the way.  {C++} and {SPEC CPU2006} are
%%% brace-protected in the title to preserve their capitalization.
@Article{Wong:2007:CBS,
  author =       "Michael Wong",
  title =        "{C++} benchmarks in {SPEC CPU2006}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "1",
  pages =        "77--83",
  month =        mar,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1241601.1241617",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:47:26 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In SPEC CPU2006, there are three C++ integer
                 benchmarks and four floating-point C++ benchmarks. This
                 paper describes the work of incorporating C++
                 benchmarks into SPEC CPU2006. It describes the base
                 language standard supported and the basis for run rules
                 adopted to maintain an even playing field for different
                 compilers. It also describes issues that complicate
                 porting C++ benchmarks. It describes some of the C++
                 Standard compliance issues that were technically
                 interesting during the benchmark development phase,
                 using as examples the behavior of const-correctness,
                 nested class access of private member of enclosing
                 class, and unneeded template instantiations.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Henning:2007:SCM,
  author =       "John L. Henning",
  title =        "{SPEC CPU2006} memory footprint",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "1",
  pages =        "84--89",
  month =        mar,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1241601.1241618",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:47:26 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The nominal goal for memory consumption by SPEC
                 CPU2006 benchmarks is up to about 900 MB when compiled
                 with 32-bit pointers. The 900 MB maximum was chosen so
                 that a system with 1 GB will have about 100 MB available
                 for the operating system and overhead processes. By
                 comparison, the goal for SPEC CPU2000 was 200 MB [1].",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gove:2007:CWS,
  author =       "Darryl Gove",
  title =        "{CPU2006} working set size",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "1",
  pages =        "90--96",
  month =        mar,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1241601.1241619",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:47:26 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "SPEC CPU2000 had a target memory footprint of 200 MB
                 for the benchmarks [1], to enable the suite to run on
                 machines with 256 MB of memory. Six years have elapsed
                 since the release of that suite, and in that time
                 memory sizes have increased significantly, so the
                 memory requirements for the recently released CPU2006
                 reflect this. CPU2006 has been targeted to have a
                 benchmark memory footprint of about 900 MB, allowing the
                 suite to run on machines with 1 GB of memory.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Korn:2007:SCS,
  author =       "Wendy Korn and Moon S. Chang",
  title =        "{SPEC CPU2006} sensitivity to memory page sizes",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "1",
  pages =        "97--101",
  month =        mar,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1241601.1241620",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:47:26 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "SPEC CPU2006 is a compute-intensive industry standard
                 benchmark suite published in August 2006. This paper
                 characterizes the memory access behavior of SPEC
                 CPU2006 running on IBM POWER5+ microprocessors. We
                 measure the maximum and average memory usage of the
                 benchmarks to validate SPEC's memory requirement
                 criteria. This paper also analyzes how different page
                 sizes affect the performance of the benchmarks. The
                 experiment reveals that 64 KB and 16 MB pages improve
                 the performance up to 46.9\% and 50.9\%,
                 respectively.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "CPI analysis; large page size; memory usage;
                 performance optimization; SPEC CPU2006 benchmarks;
                 workload characterization",
}

@Article{Weicker:2007:SPR,
  author =       "Reinhold P. Weicker and John L. Henning",
  title =        "Subroutine profiling results for the {CPU2006}
                 benchmarks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "1",
  pages =        "102--111",
  month =        mar,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1241601.1241621",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:47:26 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Subroutine profiling is a well-known performance tool.
                 For application or system programmers, it determines
                 `hot spots' where the program spends most of its time,
                 and where careful rewriting can most help performance.
                 For compiler authors, it can give information about
                 programming style in such hot spots, and can indicate
                 where compiler improvements may be useful. For hardware
                 designers and analysts, it can be the starting point to
                 explain performance behavior.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ye:2007:CFA,
  author =       "Dong Ye and Joydeep Ray and David Kaeli",
  title =        "Characterization of file {I/O} activity for {SPEC
                 CPU2006}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "1",
  pages =        "112--117",
  month =        mar,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1241601.1241622",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:47:26 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "SPEC CPU2006 is a compute-intensive benchmark suite
                 designed to stress a computer system's processor,
                 memory subsystem, and compiler. To construct this
                 suite, SPEC has selected benchmarks that are derived
                 from real world applications. When run with their
                 reference inputs, these programs place a significant
                 computational burden on today's mainstream desktops as
                 well as high-end workstations and servers.\par

                 For these applications to thoroughly exercise the
                 merits of a particular processor/memory design point,
                 it is necessary to limit the amount of I/O activity
                 generated. Since these applications come from real
                 world applications, the suite developers have
                 considered how best to limit the amount of file-based
                 I/O activity present in these applications. This paper
                 presents the characteristics of file I/O activity in
                 the resulting suite and its overall impact on the
                 performance of these applications. We also report on
                 some of the choices SPEC has made in order to reduce
                 the file I/O activity in some specific programs of the
                 suite.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Henning:2007:PCD,
  author =       "John L. Henning",
  title =        "Performance counters and development of {SPEC
                 CPU2006}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "1",
  pages =        "118--121",
  month =        mar,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1241601.1241623",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:47:26 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Performance counters provide the means to track
                 detailed events that occur on a CPU chip. These events
                 are of interest to both performance analysts and
                 compiler developers. Counting them provides essential
                 clues to guide performance improvement. For example, a
                 tester who sees that a program has a high cache miss
                 rate on a particular system may experiment with
                 compilation options that improve prefetching. A
                 compiler developer who sees the same thing may realize
                 that the code generator's machine model is missing some
                 crucial detail of behavior on that particular system.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gove:2007:ECB,
  author =       "Darryl Gove and Lawrence Spracklen",
  title =        "Evaluating the correspondence between training and
                 reference workloads in {SPEC CPU2006}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "1",
  pages =        "122--129",
  month =        mar,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1241601.1241624",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:47:26 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Profile feedback (sometimes called Feedback Directed
                 Optimisation, FDO) is a useful technique for providing
                 the compiler with additional information about runtime
                 program flow. The compiler is able to use this
                 information to make optimisation decisions that improve
                 the way the code is laid out in memory or determine
                 which routines are inlined, and hence improve the
                 performance of the application.\par

                 The use of profile feedback requires the code to be
                 compiled twice. The first time the compiler generates
                 an instrumented version of the application. This
                 instrumented version is then run on one or more
                 `representative' training workloads to gather profile
                 data. This profile data contains information such as
                 how many times each routine is executed and how
                 frequently each branch is taken. The second pass
                 through the compiler uses this information to make more
                 enlightened optimisation decisions.\par

                 The quality of the training data impacts the ability of
                 the compiler to do the best job that it can. This paper
                 discusses a method of assessing the similarity of the
                 training workload to the reference workload, and
                 applies this methodology to evaluate the training
                 workloads in the SPEC CPU2006 benchmark suite.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Spradling:2007:SCB,
  author =       "Cloyce D. Spradling",
  title =        "{SPEC CPU2006} benchmark tools",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "1",
  pages =        "130--134",
  month =        mar,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1241601.1241625",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:47:26 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The benchmarks that make up the SPEC CPU2006 benchmark
                 suite are set-up, run, timed, and scored by the CPU
                 tools harness. The tools have evolved over time from a
                 collection of edit-it-yourself makefiles, scripts, and
                 an Excel spreadsheet to the current Perl-based suite.
                 The basic purpose of the tools is to make life easier
                 for the benchmarker; they make it easier to tweak
                 compilation settings, easier to keep track of those
                 settings, and most importantly, they make it easier to
                 follow the run and reporting rules.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sridhar:2007:HLO,
  author =       "Swaroop Sridhar and Jonathan S. Shapiro and Prashanth
                 P. Bungale",
  title =        "{HDTrans}: a low-overhead dynamic translator",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "1",
  pages =        "135--140",
  month =        mar,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1241601.1241602",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:47:26 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Dynamic translation is a general purpose tool used for
                 instrumenting programs at run time. Many current
                 translators perform substantial rewriting during
                 translation in an attempt to reduce execution time.
                 When dynamic translation is used as a ubiquitous policy
                 enforcement mechanism, the majority of program
                 executions have no dominating inner loop that can be
                 used to amortize the cost of translation. Even under
                 more favorable usage assumptions, our measurements show
                 that such optimizations offer no significant benefit in
                 most cases. A simpler, more maintainable, adaptable,
                 and smaller translator may be preferable to more
                 complicated designs.\par

                 In this paper, we present HDTrans, a light-weight IA-32
                 to IA-32 binary translation system that uses some
                 simple and effective translation techniques in
                 combination with established trace linearization and
                 code caching optimizations. We also present an
                 evaluation of translation overhead under non-ideal
                 conditions, showing that conventional benchmarks do not
                 provide a good prediction of translation overhead when
                 used pervasively.\par

                 A further contribution of this paper is an analysis of
                 the effectiveness of post-compile static
                 pre-translation techniques for overhead reduction. Our
                 results indicate that static pre-translation is
                 effective only when expensive instrumentation or
                 optimization is performed, and that efficient reload of
                 pre-translated code incurs a substantial execution-time
                 penalty.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Yan:2007:HMC,
  author =       "Jun Yan and Wei Zhang",
  title =        "Hybrid multi-core architecture for boosting
                 single-threaded performance",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "1",
  pages =        "141--148",
  month =        mar,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1241601.1241603",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:47:26 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The scaling of technology and the diminishing return
                 of complicated uniprocessors have driven the industry
                 towards multicore processors. While multithreaded
                 applications can naturally leverage the enhanced
                 throughput of multi-core processors, a large number of
                 important applications are single-threaded, which
                 cannot automatically harness the potential of
                 multi-core processors. In this paper, we propose a
                 compiler-driven heterogeneous multicore architecture,
                 consisting of tightly-integrated VLIW (Very Long
                 Instruction Word) and superscalar processors on a
                 single chip, to automatically boost the performance of
                 single-threaded applications without compromising the
                 capability to support multithreaded programs. In the
                 proposed multi-core architecture, while the
                 high-performance VLIW core is used to run code segments
                 with high instruction-level parallelism (ILP) extracted
                 by the compiler; the superscalar core can be exploited
                 to deal with the runtime events that are typically
                 difficult for the VLIW core to handle, such as L2 cache
                 misses. Our initial experimental results by running the
                 preexecution thread on the superscalar core to mitigate
                 the L2 cache misses of the main thread on the VLIW core
                 indicate that the proposed VLIW/superscalar multi-core
                 processor can automatically improve the performance of
                 single-threaded general-purpose applications by up to
                 40.8\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:2007:INa,
  author =       "Mark Thorson",
  title =        "{Internet} nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "1",
  pages =        "149--154",
  month =        mar,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1241601.1241627",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:47:26 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This column consists of selected traffic from the
                 comp.arch newsgroup, a forum for discussion of computer
                 architecture on the Internet---an international
                 computer network.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Shaw:2007:ASP,
  author =       "David E. Shaw and Martin M. Deneroff and Ron O. Dror
                 and Jeffrey S. Kuskin and Richard H. Larson and John K.
                 Salmon and Cliff Young and Brannon Batson and Kevin J.
                 Bowers and Jack C. Chao and Michael P. Eastwood and
                 Joseph Gagliardo and J. P. Grossman and C. Richard Ho
                 and Douglas J. Ierardi and Istv{\'a}n Kolossv{\'a}ry
                 and John L. Klepeis and Timothy Layman and Christine
                 McLeavey and Mark A. Moraes and Rolf Mueller and Edward
                 C. Priest and Yibing Shan and Jochen Spengler and
                 Michael Theobald and Brian Towles and Stanley C. Wang",
  title =        "{Anton}, a special-purpose machine for molecular
                 dynamics simulation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "1--12",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1250662.1250664",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The ability to perform long, accurate molecular
                 dynamics (MD) simulations involving proteins and other
                 biological macro-molecules could in principle provide
                 answers to some of the most important currently
                 outstanding questions in the fields of biology,
                 chemistry and medicine. A wide range of biologically
                 interesting phenomena, however, occur over time scales
                 on the order of a millisecond---about three orders of
                 magnitude beyond the duration of the longest current MD
                 simulations.\par

                 In this paper, we describe a massively parallel machine
                 called Anton, which should be capable of executing
                 millisecond-scale classical MD simulations of such
                 biomolecular systems. The machine, which is scheduled
                 for completion by the end of 2008, is based on 512
                 identical MD-specific ASICs that interact in a tightly
                 coupled manner using a specialized high-speed
                 communication network. Anton has been designed to use
                 both novel parallel algorithms and special-purpose
                 logic to dramatically accelerate those calculations
                 that dominate the time required for a typical MD
                 simulation. The remainder of the simulation algorithm
                 is executed by a programmable portion of each chip that
                 achieves a substantial degree of parallelism while
                 preserving the flexibility necessary to accommodate
                 anticipated advances in physical models and simulation
                 methods.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "bioinformatics; biomolecular system simulation;
                 computational biology; computational drug design;
                 molecular dynamics; protein folding; protein structure;
                 special-purpose machine",
}

@Article{Fan:2007:PPW,
  author =       "Xiaobo Fan and Wolf-Dietrich Weber and Luiz Andre
                 Barroso",
  title =        "Power provisioning for a warehouse-sized computer",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "13--23",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1250662.1250665",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Large-scale Internet services require a computing
                 infrastructure that can be appropriately described as a
                 warehouse-sized computing system. The cost of building
                 datacenter facilities capable of delivering a given
                 power capacity to such a computer can rival the
                 recurring energy consumption costs themselves.
                 Therefore, there are strong economic incentives to
                 operate facilities as close as possible to maximum
                 capacity, so that the non-recurring facility costs can
                 be best amortized. That is difficult to achieve in
                 practice because of uncertainties in equipment power
                 ratings and because power consumption tends to vary
                 significantly with the actual computing activity.
                 Effective power provisioning strategies are needed to
                 determine how much computing equipment can be safely
                 and efficiently hosted within a given power
                 budget.\par

                 In this paper we present the aggregate power usage
                 characteristics of large collections of servers (up to
                 15 thousand) for different classes of applications over
                 a period of approximately six months. Those
                 observations allow us to evaluate opportunities for
                 maximizing the use of the deployed power capacity of
                 datacenters, and assess the risks of over-subscribing
                 it. We find that even in well-tuned applications there
                 is a noticeable gap (7--16\%) between achieved and
                 theoretical aggregate peak power usage at the cluster
                 level (thousands of servers). The gap grows to almost
                 40\% in whole datacenters. This headroom can be used to
                 deploy additional compute equipment within the same
                 power budget with minimal risk of exceeding it. We use
                 our modeling framework to estimate the potential of
                 power management schemes to reduce peak power and
                 energy usage. We find that the opportunities for power
                 and energy savings are significant, but greater at the
                 cluster-level (thousands of servers) than at the
                 rack-level (tens). Finally we argue that systems need
                 to be power efficient across the activity range, and
                 not only at peak performance levels.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "energy efficiency; power modeling; power
                 provisioning",
}

@Article{Blundell:2007:MFC,
  author =       "Colin Blundell and Joe Devietti and E. Christopher
                 Lewis and Milo M. K. Martin",
  title =        "Making the fast case common and the uncommon case
                 simple in unbounded transactional memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "24--34",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1273440.1250667",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Hardware transactional memory has great potential to
                 simplify the creation of correct and efficient
                 multithreaded programs, allowing programmers to exploit
                 more effectively the soon-to-be-ubiquitous multi-core
                 designs. Several recent proposals have extended the
                 original bounded transactional memory to unbounded
                 transactional memory, a crucial step toward
                 transactions becoming a general-purpose primitive.
                 Unfortunately, supporting the concurrent execution of
                 an unbounded number of unbounded transactions is
                 challenging, and as a result, many proposed
                 implementations are complex.\par

                 This paper explores a different approach. First, we
                 introduce the permissions-only cache to extend the
                 bound at which transactions overflow to allow the fast,
                 bounded case to be used as frequently as possible.
                 Second, we propose OneTM to simplify the implementation
                 of unbounded transactional memory by bounding the
                 concurrency of transactions that overflow the cache.
                 These mechanisms work synergistically to provide a
                 simple and fast unbounded transactional memory
                 system.\par

                 The permissions-only cache efficiently maintains the
                 coherence permissions---but not data---for blocks read
                 or written transactionally that have been evicted from
                 the processor's caches. By holding coherence
                 permissions for these blocks, the regular cache
                 coherence protocol can be used to detect transactional
                 conflicts using only a few bits of on-chip storage per
                 overflowed cache block. OneTM allows only one
                 overflowed transaction at a time, relying on the
                 permissions-only cache to ensure that overflow is
                 infrequent. We present two implementations. In
                 OneTM-Serialized, an overflowed transaction simply
                 stalls all other threads in the application.\par

                 In OneTM-Concurrent, non-overflowed transactions and
                 non-transactional code can execute concurrently with
                 the overflowed transaction, providing more concurrency
                 while retaining OneTM's core simplifying assumption.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "concurrency; parallel programming; transactional
                 memory; transactions",
}

@Article{Zhu:2007:SSB,
  author =       "Weirong Zhu and Vugranam C. Sreedhar and Ziang Hu and
                 Guang R. Gao",
  title =        "Synchronization state buffer: supporting efficient
                 fine-grain synchronization on many-core architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "35--45",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1250662.1250668",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Efficient fine-grain synchronization is extremely
                 important to effectively harness the computational
                 power of many-core architectures. However, designing
                 and implementing fine-grain synchronization in such
                 architectures presents several challenges, including
                 issues of synchronization induced overhead, storage
                 cost, scalability, and the level of granularity to
                 which synchronization is applicable. This paper
                 proposes the Synchronization State Buffer (SSB), a
                 scalable architectural design for fine-grain
                 synchronization that efficiently performs
                 synchronizations between concurrent threads. The design
                 of SSB is motivated by the following observation: at
                 any instance during the parallel execution only a small
                 fraction of memory locations are actively participating
                 in synchronization. Based on this observation we
                 present a fine-grain synchronization design that
                 records and manages the states of frequently
                 synchronized data using modest hardware support. We
                 have implemented the SSB design in the context of the
                 160-core IBM Cyclops-64 architecture. Using detailed
                 simulation, we present our experience for a set of
                 benchmarks with different workload characteristics.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "fine-grain synchronization; many-core; SSB",
}

@Article{Marty:2007:VHS,
  author =       "Michael R. Marty and Mark D. Hill",
  title =        "Virtual hierarchies to support server consolidation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "46--56",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1250662.1250670",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Server consolidation is becoming an increasingly
                 popular technique to manage and utilize systems. This
                 paper develops CMP memory systems for server
                 consolidation where most sharing occurs within Virtual
                 Machines (VMs). Our memory systems maximize shared
                 memory accesses serviced within a VM, minimize
                 interference among separate VMs, facilitate dynamic
                 reassignment of VMs to processors and memory, and
                 support content-based page sharing among VMs. We begin
                 with a tiled architecture where each of 64 tiles
                 contains a processor, private L1 caches, and an L2
                 bank. First, we reveal why single-level directory
                 designs fail to meet workload consolidation goals.
                 Second, we develop the paper's central idea of imposing
                 a two-level virtual (or logical) coherence hierarchy on
                 a physically flat CMP that harmonizes with VM
                 assignment. Third, we show that the best of our two
                 virtual hierarchy (VH) variants performs 12--58\% better
                 than the best alternative flat directory protocol when
                 consolidating Apache, OLTP, and Zeus commercial workloads
                 on our simulated 64-core CMP.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "cache coherence; chip multiprocessors (CMPs); memory
                 hierarchies; multicore; partitioning; server
                 consolidation; virtual machines",
}

@Article{Nesbit:2007:VPC,
  author =       "Kyle J. Nesbit and James Laudon and James E. Smith",
  title =        "Virtual private caches",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "57--68",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1273440.1250671",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Virtual Private Machines (VPM) provide a framework for
                 Quality of Service (QoS) in CMP-based computer systems.
                 VPMs incorporate microarchitecture mechanisms that
                 allow shares of hardware resources to be allocated to
                 executing threads, thus providing applications with an
                 upper bound on execution time regardless of other
                 thread activity. Virtual Private Caches (VPCs) are an
                 important element of VPMs. VPC hardware consists of two
                 major components: the VPC Arbiter, which manages shared
                 cache bandwidth, and the VPC Capacity Manager, which
                 manages the cache storage. Both the VPC Arbiter and VPC
                 Capacity Manager provide minimum service guarantees
                 that, when combined, achieve QoS for the cache
                 subsystem. Simulation-based evaluation shows that
                 conventional cache bandwidth management policies allow
                 concurrently executing threads to affect each other
                 significantly in an uncontrollable manner. The
                 evaluation targets cache bandwidth because the effects
                 of cache capacity sharing have been studied elsewhere.
                 In contrast with the conventional policies, the VPC
                 Arbiter meets its QoS performance objectives on all
                 workloads studied and over a range of allocated
                 bandwidth levels. The VPC Arbiter's fairness policy,
                 which distributes leftover bandwidth, mitigates the
                 effects of cache preemption latencies, thus ensuring
                 threads a high-degree of performance isolation.
                 Furthermore, the VPC Arbiter eliminates negative
                 bandwidth interference which can improve aggregate
                 throughput and resource utilization.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "chip multiprocessor; performance isolation; quality of
                 service; shared caches; soft real-time",
}

@Article{Minh:2007:EHT,
  author =       "Chi Cao Minh and Martin Trautmann and JaeWoong Chung
                 and Austen McDonald and Nathan Bronson and Jared Casper
                 and Christos Kozyrakis and Kunle Olukotun",
  title =        "An effective hybrid transactional memory system with
                 strong isolation guarantees",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "69--80",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1250662.1250673",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "We propose signature-accelerated transactional memory
                 (SigTM), a hybrid TM system that reduces the overhead
                 of software transactions. SigTM uses hardware
                 signatures to track the read-set and write-set for
                 pending transactions and perform conflict detection
                 between concurrent threads. All other transactional
                 functionality, including data versioning, is
                 implemented in software. Unlike previously proposed
                 hybrid TM systems, SigTM requires no modifications to
                 the hardware caches, which reduces hardware cost and
                 simplifies support for nested transactions and
                 multithreaded processor cores. SigTM is also the first
                 hybrid TM system to provide strong isolation guarantees
                 between transactional blocks and non-transactional
                 accesses without additional read and write barriers in
                 non-transactional code.\par

                 Using a set of parallel programs that make frequent use
                 of coarse-grain transactions, we show that SigTM
                 accelerates software transactions by 30\% to 280\%. For
                 certain workloads, SigTM can match the performance of a
                 full-featured hardware TM system, while for workloads
                 with large read-sets it can be up to two times slower.
                 Overall, we show that SigTM combines the performance
                 characteristics and strong isolation guarantees of
                 hardware TM implementations with the low cost and
                 flexibility of software TM systems.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "multi-core architectures; parallel programming; strong
                 isolation; transactional memory",
}

@Article{Bobba:2007:PPH,
  author =       "Jayaram Bobba and Kevin E. Moore and Haris Volos and
                 Luke Yen and Mark D. Hill and Michael M. Swift and
                 David A. Wood",
  title =        "Performance pathologies in hardware transactional
                 memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "81--91",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1273440.1250674",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Hardware Transactional Memory (HTM) systems reflect
                 choices from three key design dimensions: conflict
                 detection, version management, and conflict resolution.
                 Previously proposed HTMs represent three points in this
                 design space: lazy conflict detection, lazy version
                 management, committer wins (LL); eager conflict
                 detection, lazy version management, requester wins
                 (EL); and eager conflict detection, eager version
                 management, and requester stalls with conservative
                 deadlock avoidance (EE). To isolate the effects of
                 these high-level design decisions, we develop a common
                 framework that abstracts away differences in cache
                 write policies, interconnects, and ISA to compare these
                 three design points. Not surprisingly, the relative
                 performance of these systems depends on the workload.
                 Under light transactional loads they perform similarly,
                 but under heavy loads they differ by up to 80\%. None
                 of the systems performs best on all of our benchmarks.
                 We identify seven performance pathologies --- interactions
                 between workload and system that degrade performance --- as
                 the root cause of many performance differences:
                 FriendlyFire, StarvingWriter, SerializedCommit,
                 FutileStall, StarvingElder, RestartConvoy, and
                 DuelingUpgrades. We discuss when and on which systems
                 these pathologies can occur and show that they actually
                 manifest within TM workloads. The insight provided by
                 these pathologies motivated four enhanced systems that
                 often significantly reduce transactional memory
                 overhead. Importantly, by avoiding transaction
                 pathologies, each enhanced system performs well across
                 our suite of benchmarks.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "contention management; hardware; pathology;
                 performance; transactional memory",
}

@Article{Ramadan:2007:MTT,
  author =       "Hany E. Ramadan and Christopher J. Rossbach and Donald
                 E. Porter and Owen S. Hofmann and Aditya Bhandari and
                 Emmett Witchel",
  title =        "{MetaTM\slash TxLinux}: transactional memory for an
                 operating system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "92--103",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1250662.1250675",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper quantifies the effect of architectural
                 design decisions on the performance of TxLinux. TxLinux
                 is a Linux kernel modified to use transactions in place
                 of locking primitives in several key subsystems. We run
                 TxLinux on MetaTM, which is a new hardware-transaction
                 memory (HTM) model. MetaTM contains features that
                 enable efficient and correct interrupt handling for an
                 x86-like architecture. Live stack overwrites can
                 corrupt non-transactional stack memory and requires a
                 small change to the transaction register checkpoint
                 hardware to ensure correct operation of the operating
                 system. We also propose stack based early release to
                 reduce spurious conflicts on stack memory between
                 kernel code and interrupt handlers. We use MetaTM to
                 examine the performance sensitivity of individual
                 architectural features. For TxLinux we find that Polka
                 and SizeMatters are effective contention management
                 policies, some form of backoff on transaction
                 contention is vital for performance, and stalling on a
                 transaction conflict reduces transaction restart rates,
                 but does not improve performance. Transaction write
                 sets are small, and performance is insensitive to
                 transaction abort costs but sensitive to commit
                 costs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "MetaTM; OS support; transactional memory; TxLinux",
}

@Article{Shriraman:2007:IHS,
  author =       "Arrvindh Shriraman and Michael F. Spear and Hemayet
                 Hossain and Virendra J. Marathe and Sandhya Dwarkadas
                 and Michael L. Scott",
  title =        "An integrated hardware-software approach to flexible
                 transactional memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "104--115",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1273440.1250676",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "There has been considerable recent interest in both
                 hardware and software transactional memory (TM). We
                 present an intermediate approach, in which hardware
                 serves to accelerate a TM implementation controlled
                 fundamentally by software. Specifically, we describe an
                 alert on update mechanism (AOU) that allows a thread to
                 receive fast, asynchronous notification when
                 previously-identified lines are written by other
                 threads, and a programmable data isolation mechanism
                 (PDI) that allows a thread to hide its speculative
                 writes from other threads, ignoring conflicts, until
                 software decides to make them visible. These mechanisms
                 reduce bookkeeping, validation, and copying overheads
                 without constraining software policy on a host of
                 design decisions.\par

                 We have used AOU and PDI to implement a
                 hardware-accelerated-software transactional memory
                 system we call RTM. We have also used AOU alone to
                 create a simpler `RTM-Lite'. Across a range of
                 microbenchmarks, RTM outperforms RSTM, a publicly
                 available software transactional memory system, by as
                 much as 8.7x (geometric mean of 3.5x) in single-thread
                 mode. At 16 threads, it outperforms RSTM by as much as
                 5x, with an average speedup of 2x. Performance degrades
                 gracefully when transactions overflow hardware
                 structures. RTM-Lite is slightly faster than RTM for
                 transactions that modify only small objects; full RTM
                 is significantly faster when objects are large. In a
                 strong argument for policy flexibility, we find that
                 the choice between eager (first-access) and lazy
                 (commit-time) conflict detection can lead to
                 significant performance differences in both directions,
                 depending on application characteristics.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "cache coherence; multiprocessors; RSTM; transactional
                 memory",
}

@Article{Abad:2007:RRE,
  author =       "Pablo Abad and Valentin Puente and Jos{\'e} Angel
                 Gregorio and Pablo Prieto",
  title =        "Rotary router: an efficient architecture for {CMP}
                 interconnection networks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "116--125",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1250662.1250678",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The trend towards increasing the number of processor
                 cores and cache capacity in future Chip-Multiprocessors
                 (CMPs), will require scalable packet-switched
                 interconnection networks adapted to the restrictions
                 imposed by the CMP environment. This paper presents an
                 innovative router design, which successfully addresses
                 CMP cost/performance constraints. The router structure
                 is based on two independent rings, which force packets
                 to circulate either clockwise or anti-clockwise,
                 traveling through every port of the router. It uses a
                 completely decentralized scheduling scheme, which
                 allows the design to: (1) take advantage of wide links,
                 (2) reduce Head of Line blocking, (3) use adaptive
                 routing, (4) be topology agnostic, (5) scale with
                 network degree, and (6) have reasonable power
                 consumption and implementation cost. A thorough
                 comparative performance analysis against competitive
                 conventional routers shows an advantage for our
                 proposal of up to 50\% in terms of raw performance and
                 nearly 60\% in terms of energy-delay product.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "chip multi-processors; interconnection networks;
                 router architecture",
}

@Article{Kim:2007:FBC,
  author =       "John Kim and William J. Dally and Dennis Abts",
  title =        "Flattened butterfly: a cost-efficient topology for
                 high-radix networks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "126--137",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1250662.1250679",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Increasing integrated-circuit pin bandwidth has
                 motivated a corresponding increase in the degree or
                 radix of interconnection networks and their routers.
                 This paper introduces the flattened butterfly, a
                 cost-efficient topology for high-radix networks. On
                 benign (load-balanced) traffic, the flattened butterfly
                 approaches the cost/performance of a butterfly network
                 and has roughly half the cost of a comparable
                 performance Clos network. The advantage over the Clos
                 is achieved by eliminating redundant hops when they are
                 not needed for load balance. On adversarial traffic,
                 the flattened butterfly matches the cost/performance of
                 a folded-Clos network and provides an order of
                 magnitude better performance than a conventional
                 butterfly. In this case, global adaptive routing is
                 used to switch the flattened butterfly from minimal to
                 non-minimal routing --- using redundant hops only when
                 they are needed. Minimal and non-minimal, oblivious and
                 adaptive routing algorithms are evaluated on the
                 flattened butterfly. We show that load-balancing
                 adversarial traffic requires nonminimal
                 globally-adaptive routing and show that sequential
                 allocators are required to avoid transient load
                 imbalance when using adaptive routing algorithms. We
                 also compare the cost of the flattened butterfly to
                 folded-Clos, hypercube, and butterfly networks with
                 identical capacity and show that the flattened
                 butterfly is more cost-efficient than folded-Clos and
                 hypercube topologies.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "cost model; flattened butterfly; global adaptive
                 routing; high-radix routers; interconnection networks;
                 topology",
}

@Article{Kim:2007:NDD,
  author =       "Jongman Kim and Chrysostomos Nicopoulos and Dongkook
                 Park and Reetuparna Das and Yuan Xie and Vijaykrishnan
                 Narayanan and Mazin S. Yousif and Chita R. Das",
  title =        "A novel dimensionally-decomposed router for on-chip
                 communication in {$3$D} architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "138--149",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1273440.1250680",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Much like multi-story buildings in densely packed
                 metropolises, three-dimensional (3D) chip structures
                 are envisioned as a viable solution to skyrocketing
                 transistor densities and burgeoning die sizes in
                 multi-core architectures. Partitioning a larger die
                 into smaller segments and then stacking them in a 3D
                 fashion can significantly reduce latency and energy
                 consumption. Such benefits emanate from the notion that
                 inter-wafer distances are negligible compared to
                 intra-wafer distances. This attribute substantially
                 reduces global wiring length in 3D chips. The work in
                 this paper integrates the increasingly popular idea of
                 packet-based Networks-on-Chip (NoC) into a 3D setting.
                 While NoCs have been studied extensively in the 2D
                 realm, the microarchitectural ramifications of moving
                 into the third dimension have yet to be fully explored.
                 This paper presents a detailed exploration of
                 inter-strata communication architectures in 3D NoCs.
                 Three design options are investigated; a simple
                 bus-based inter-wafer connection, a hop-by-hop standard
                 3D design, and a full 3D crossbar implementation. In
                 this context, we propose a novel partially-connected 3D
                 crossbar structure, called the 3D
                 Dimensionally-Decomposed (DimDe) Router, which provides
                 a good tradeoff between circuit complexity and
                 performance benefits. Simulation results using (a) a
                 stand-alone cycle-accurate 3D NoC simulator running
                 synthetic workloads, and (b) a hybrid 3D NoC/cache
                 simulation environment running real commercial and
                 scientific benchmarks, indicate that the proposed DimDe
                 design provides latency and throughput improvements of
                 over 20\% on average over the other 3D architectures,
                 while remaining within 5\% of the full 3D crossbar
                 performance. Furthermore, based on synthesized hardware
                 implementations in 90 nm technology, the DimDe
                 architecture outperforms all other designs -- including
                 the full 3D crossbar -- by an average of 26\% in terms
                 of the Energy-Delay Product (EDP).",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "3D architecture; 3D integration; network-on-chip
                 (NoC)",
}

@Article{Kumar:2007:EVC,
  author =       "Amit Kumar and Li-Shiuan Peh and Partha Kundu and
                 Niraj K. Jha",
  title =        "Express virtual channels: towards the ideal
                 interconnection fabric",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "150--161",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1250662.1250681",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Due to wire delay scalability and bandwidth
                 limitations inherent in shared buses and dedicated
                 links, packet-switched on-chip interconnection networks
                 are fast emerging as the pervasive communication fabric
                 to connect different processing elements in many-core
                 chips. However, current state-of-the-art
                 packet-switched networks rely on complex routers which
                 increases the communication overhead and energy
                 consumption as compared to the ideal interconnection
                 fabric.\par

                 In this paper, we try to close the gap between the
                 state-of-the-art packet-switched network and the ideal
                 interconnect by proposing express virtual channels
                 (EVCs), a novel flow control mechanism which allows
                 packets to virtually bypass intermediate routers along
                 their path in a completely non-speculative fashion,
                 thereby lowering the energy/delay towards that of a
                 dedicated wire while simultaneously approaching ideal
                 throughput with a practical design suitable for on-chip
                 networks.\par

                 Our evaluation results using a detailed cycle-accurate
                 simulator on a range of synthetic traffic and SPLASH
                 benchmark traces show up to 84\% reduction in packet
                 latency and up to 23\% improvement in throughput while
                 reducing the average router energy consumption by up to
                 38\% over an existing state-of-the-art packet-switched
                 design. When compared to the ideal interconnect, EVCs
                 add just two cycles to the no-load latency, and are
                 within 14\% of the ideal throughput. Moreover, we show
                 that the proposed design incurs a minimal hardware
                 overhead while exhibiting excellent scalability with
                 increasing network sizes.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "flow-control; packet-switching; router design",
}

@Article{Kumar:2007:CAS,
  author =       "Sanjeev Kumar and Christopher J. Hughes and Anthony
                 Nguyen",
  title =        "{Carbon}: architectural support for fine-grained
                 parallelism on chip multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "162--173",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1273440.1250683",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Chip multiprocessors (CMPs) are now commonplace, and
                 the number of cores on a CMP is likely to grow
                 steadily. However, in order to harness the additional
                 compute resources of a CMP, applications must expose
                 their thread-level parallelism to the hardware. One
                 common approach to doing this is to decompose a program
                 into parallel `tasks' and allow an underlying software
                 layer to schedule these tasks to different threads.
                 Software task scheduling can provide good parallel
                 performance as long as tasks are large compared to the
                 software overheads.\par

                 We examine a set of applications from an important
                 emerging domain: Recognition, Mining, and Synthesis
                 (RMS). Many RMS applications are compute-intensive and
                 have abundant thread-level parallelism, and are
                 therefore good targets for running on a CMP. However, a
                 significant number have small tasks for which software
                 task schedulers achieve only limited parallel
                 speedups.\par

                 We propose Carbon, a hardware technique to accelerate
                 dynamic task scheduling on scalable CMPs. Carbon has
                 relatively simple hardware, most of which can be placed
                 far from the cores. We compare Carbon to some highly
                 tuned software task schedulers for a set of RMS
                 benchmarks with small tasks. Carbon delivers
                 significant performance improvements over the best
                 software scheduler: on average for 64 cores, 68\%
                 faster on a set of loop-parallel benchmarks, and 109\%
                 faster on a set of task-parallel benchmarks.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "architectural support; CMP; loop and task
                 parallelism",
}

@Article{Neelakantam:2007:HAR,
  author =       "Naveen Neelakantam and Ravi Rajwar and Suresh Srinivas
                 and Uma Srinivasan and Craig Zilles",
  title =        "Hardware atomicity for reliable software speculation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "174--185",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1250662.1250684",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Speculative compiler optimizations are effective in
                 improving both single-thread performance and reducing
                 power consumption, but their implementation introduces
                 significant complexity, which can limit their adoption,
                 limit their optimization scope, and negatively impact
                 the reliability of the compilers that implement them.
                 To eliminate much of this complexity, as well as
                 increase the effectiveness of these optimizations, we
                 propose that microprocessors provide
                 architecturally-visible hardware primitives for atomic
                 execution. These primitives provide to the compiler the
                 ability to optimize the program's hot path in
                 isolation, allowing the use of non-speculative
                 formulations of optimization passes to perform
                 speculative optimizations. Atomic execution guarantees
                 that if a speculation invariant does not hold, the
                 speculative updates are discarded, the register state
                 is restored, and control is transferred to a
                 non-speculative version of the code, thereby relieving
                 the compiler from the responsibility of generating
                 compensation code.\par

                 We demonstrate the benefit of hardware atomicity in the
                 context of a Java virtual machine. We find
                 incorporating the notion of atomic regions into an
                 existing compiler intermediate representation to be
                 natural, requiring roughly 3,000 lines of code
                 ($ \sim $3\% of a JVM's optimizing compiler), most of
                 which were for region formation. Its incorporation
                 creates new opportunities for existing optimization
                 passes, as well as greatly simplifying the
                 implementation of additional optimizations (e.g.,
                 partial inlining, partial loop unrolling, and
                 speculative lock elision). These optimizations reduce
                 dynamic instruction count by 11\% on average and
                 result in a 10--15\% average speedup, relative to a
                 baseline compiler with a similar degree of inlining.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "atomicity; checkpoint; isolation; Java; optimization;
                 speculation",
}

@Article{Ipek:2007:CFA,
  author =       "Engin Ipek and Meyrem Kirman and Nevin Kirman and Jose
                 F. Martinez",
  title =        "Core fusion: accommodating software diversity in chip
                 multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "186--197",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1273440.1250686",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper presents core fusion, a reconfigurable chip
                 multiprocessor (CMP) architecture where groups of
                 fundamentally independent cores can dynamically morph
                 into a larger CPU, or they can be used as distinct
                 processing elements, as needed at run time by
                 applications. Core fusion gracefully accommodates
                 software diversity and incremental parallelization in
                 CMPs. It provides a single execution model across all
                 configurations, requires no additional programming
                 effort or specialized compiler support, maintains ISA
                 compatibility, and leverages mature micro-architecture
                 technology.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "chip multiprocessors; reconfigurable architectures;
                 software diversity",
}

@Article{Chi:2007:TQA,
  author          = {Eric Chi and Stephen A. Lyon and Margaret Martonosi},
  title           = {Tailoring quantum architectures to implementation
                     style: a quantum computer for mobile and persistent
                     qubits},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {35},
  number          = {2},
  pages           = {198--209},
  month           = may,
  year            = {2007},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/1250662.1250687},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Tue Jun 17 11:48:43 MDT 2008},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {In recent years, quantum computing (QC) research
                     has moved from the realm of theoretical physics and
                     mathematics into real implementations. With many
                     different potential hardware implementations,
                     quantum computer architecture is a rich field with
                     an opportunity to solve interesting new problems
                     and to revisit old ones. This paper presents a QC
                     architecture tailored to physical implementations
                     with highly mobile and persistent quantum bits
                     (qubits). Implementations with qubit coherency
                     times that are much longer than operation times and
                     qubit transportation times that are orders of
                     magnitude faster than operation times lend greater
                     flexibility to the architecture. This is
                     particularly true in the placement and locality of
                     individual qubits. For concreteness, we assume a
                     physical device model based on electron-spin qubits
                     on liquid helium (eSHe).\par

                     Like many conventional computer architectures, QCs
                     focus on the efficient exposure of parallelism. We
                     present here a QC microarchitecture that enjoys
                     increasing computational parallelism with size and
                     latency scaling only linearly with the number of
                     operations. Although an efficient and high level of
                     parallelism is admirable, quantum hardware is still
                     expensive and difficult to build, so we demonstrate
                     how the software may be optimized to reduce an
                     application's hardware requirements by 25\% with no
                     performance loss. Because the majority of a QC's
                     time and resources are devoted to quantum error
                     correction, we also present noise modeling results
                     that evaluate error correction procedures. These
                     results demonstrate that idle qubits in memory need
                     only be refreshed approximately once every one
                     hundred operation cycles.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  keywords        = {architecture; quantum},
}

@Article{Yang:2007:BSP,
  author          = {Xuejun Yang and Xiaobo Yan and Zuocheng Xing and
                     Yu Deng and Jiang Jiang and Ying Zhang},
  title           = {A 64-bit stream processor architecture for
                     scientific applications},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {35},
  number          = {2},
  pages           = {210--219},
  month           = may,
  year            = {2007},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/1273440.1250689},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Tue Jun 17 11:48:43 MDT 2008},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Stream architecture is a novel microprocessor
                     architecture with wide application potential. But
                     as for whether it can be used efficiently in
                     scientific computing, many issues await further
                     study. This paper first gives the design and
                     implementation of a 64-bit stream processor, FT64
                     (Fei Teng 64), for scientific computing. The
                     carrying out of 64-bit extension design and
                     scientific computing oriented optimization are
                     described in such aspects as instruction set
                     architecture, stream controller, micro controller,
                     ALU cluster, memory hierarchy and interconnection
                     interface here. Second, two kinds of communications
                     as message passing and stream communications are
                     put forward. An interconnection based on the
                     communications is designed for FT64-based high
                     performance computers. Third, a novel stream
                     programming language, SF95 (Stream FORTRAN95), and
                     its compiler, SF95Compiler (Stream FORTRAN95
                     Compiler), are developed to facilitate the
                     development of scientific applications. Finally,
                     nine typical scientific application kernels are
                     tested and the results show the efficiency of
                     stream architecture for scientific computing.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  keywords        = {architecture; compiler; high performance computing;
                     program language; scientific application; stream
                     processor},
}

@Article{Hughes:2007:PSA,
  author          = {Christopher J. Hughes and Radek Grzeszczuk and
                     Eftychios Sifakis and Daehyun Kim and Sanjeev Kumar
                     and Andrew P. Selle and Jatin Chhugani and Matthew
                     Holliman and Yen-Kuang Chen},
  title           = {Physical simulation for animation and visual
                     effects: parallelization and characterization for
                     chip multiprocessors},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {35},
  number          = {2},
  pages           = {220--231},
  month           = may,
  year            = {2007},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/1273440.1250690},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Tue Jun 17 11:48:43 MDT 2008},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {We explore the emerging application area of
                     physics-based simulation for computer animation and
                     visual special effects. In particular, we examine
                     its parallelization potential and characterize its
                     behavior on a chip multiprocessor (CMP).
                     Applications in this domain model and simulate
                     natural phenomena, and often direct visual
                     components of motion pictures. We study a set of
                     three workloads that exemplify the span and
                     complexity of physical simulation applications used
                     in a production environment: fluid dynamics, facial
                     animation, and cloth simulation. They are
                     computationally demanding, requiring from a few
                     seconds to several minutes to simulate a single
                     frame; therefore, they can benefit greatly from the
                     acceleration possible with large scale CMPs.\par

                     Starting with serial versions of these
                     applications, we parallelize code accounting for at
                     least 96\% of the serial execution time, targeting
                     a large number of threads. We then study the most
                     expensive modules using a simulated 64-core
                     CMP.\par

                     For the code representing key modules, we achieve
                     parallel scaling of 45x, 50x, and 30x for fluid,
                     face, and cloth simulations, respectively. The
                     modules have a spectrum of parallel task
                     granularity and locking behavior, and all but one
                     are dominated by loop-level parallelism. Many
                     modules operate on streams of data. In some cases,
                     modules iterate over their data, leading to
                     significant temporal locality. This streaming
                     behavior leads to very high on-die and main memory
                     bandwidth requirements. Finally, most modules have
                     little inter-thread communication since they are
                     data-parallel, but a few require heavy
                     communication between data-parallel operations.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  keywords        = {characterization; CMP; parallelization; physical
                     simulation},
}

@Article{Yeh:2007:PAR,
  author =       "Thomas Y. Yeh and Petros Faloutsos and Sanjay J. Patel
                 and Glenn Reinman",
  title =        "{ParallAX}: an architecture for real-time physics",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "232--243",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1273440.1250691",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Future interactive entertainment applications will
                 feature the physical simulation of thousands of
                 interacting objects using explosions, breakable
                 objects, and cloth effects. While these applications
                 require a tremendous amount of performance to satisfy
                 the minimum frame rate of 30 FPS, there is a dramatic
                 amount of parallelism in future physics workloads. How
                 will future physics architectures leverage parallelism
                 to achieve the real-time constraint?\par

                 We propose and characterize a set of forward-looking
                 benchmarks to represent future physics load and explore
                 the design space of future physics processors. In
                 response to the demand of this workload, we demonstrate
                 an architecture with a set of powerful cores and caches
                 to provide performance for the serial and coarse-grain
                 parallel components of physics simulation, along with a
                 flexible set of simple cores to exploit fine-grain
                 parallelism. Our architecture combines intelligent,
                 application-aware L2 management with dynamic
                 coupling\slash allocation of simple cores to complex
                 cores. Furthermore, we perform sensitivity analysis on
                 interconnect alternatives to determine how tightly to
                 couple these cores.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "application specific processor; chip multiprocessor;
                 interactive entertainment; physics based animation;
                 real-time physics; stream processing",
}

@Article{Kim:2007:AIB,
  author =       "Martha Mercaldi Kim and Mojtaba Mehrara and Mark Oskin
                 and Todd Austin",
  title =        "Architectural implications of brick and mortar silicon
                 manufacturing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "244--253",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1250662.1250693",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "We introduce a novel chip fabrication technique called
                 `brick and mortar', in which chips are made from small,
                 pre-fabricated ASIC bricks and bonded in a
                 designer-specified arrangement to an inter-brick
                 communication backbone chip. The goal of brick and
                 mortar assembly is to provide a low-overhead method to
                 produce custom chips, yet with performance that tracks
                 an ASIC more closely than an FPGA. This paper examines
                 the architectural design choices in this chip-design
                 system. These choices include the definition of
                 reasonable bricks, both in functionality and size, as
                 well as the communication interconnect that the I/O cap
                 provides. To do this we synthesize candidate bricks,
                 analyze their area and bandwidth demands, and present
                 an architectural design for the inter-brick
                 communication network. We discuss a sample chip design,
                 a 16-way CMP, and analyze the costs and benefits of
                 designing chips with brick and mortar. We find that
                 this method of producing chips incurs only a small
                 performance loss (8\%) compared to a fully custom ASIC,
                 which is significantly less than the degradation seen
                 from other low-overhead chip options, such as FPGAs.
                 Finally, we measure the effect that architectural
                 design decisions have on the behavior of the proposed
                 physical brick assembly technique, fluidic
                 self-assembly.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "chip assembly; design re-use; interconnect design",
}

@Article{Amin:2007:APA,
  author =       "Ahmed M. Amin and Mithuna Thottethodi and T. N.
                 Vijaykumar and Steven Wereley and Stephen C. Jacobson",
  title =        "{AquaCore}: a programmable architecture for
                 microfluidics",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "254--265",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1273440.1250694",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Advances in microfluidic research has enabled
                 lab-on-a-chip (LoC) technology to achieve
                 miniaturization and integration of biological and
                 chemical analyses to a single chip comprising channels,
                 valves, mixers, heaters, separators, and sensors. These
                 miniature instruments appear to offer the rare
                 combination of faster, cheaper, and higher-precision
                 analyses in comparison to conventional bench-scale
                 methods. LoCs have been applied to diverse domains such
                 as proteomics, genomics, biochemistry, virology, cell
                 biology, and chemical synthesis. However, to date LoCs
                 have been designed as application-specific chips which
                 incurs significant design effort, turn-around time, and
                 cost, and degrades designer and user productivity. To
                 address these limitations, we envision a programmable
                 LoC (PLoC) and propose a comprehensive fluidic
                 instruction set, called AquaCore Instruction Set (AIS),
                 and a fluidic microarchitecture, called AquaCore, to
                 implement AIS. We present four key design aspects in
                 which the AIS and AquaCore differ from their computer
                 counterparts, and our design decisions made on the
                 basis of the implications of these differences. We
                 demonstrate the use of the PLoC in a range of domains
                 by hand-compiling real-world microfluidic assays in
                 AIS, and show a detailed breakdown of the execution
                 times for the assays and an estimate of the chip
                 area.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "fluidic; fluidic microarchitecture; instruction set;
                 microfluidics; programmable lab on a chip",
}

@Article{Wenisch:2007:MSW,
  author =       "Thomas F. Wenisch and Anastasia Ailamaki and Babak
                 Falsafi and Andreas Moshovos",
  title =        "Mechanisms for store-wait-free multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "266--277",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1273440.1250696",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Store misses cause significant delays in shared-memory
                 multiprocessors because of limited store buffering and
                 ordering constraints required for proper
                 synchronization. Today, programmers must choose from a
                 spectrum of memory consistency models that reduce store
                 stalls at the cost of increased programming complexity.
                 Prior research suggests that the performance gap among
                 consistency models can be closed through
                 speculation---enforcing order only when dynamically
                 necessary. Unfortunately, past designs either provide
                 insufficient buffering, replace all stores with
                 read-modify-write operations, and/or recover from
                 ordering violations via impractical fine-grained
                 rollback mechanisms.\par

                 We propose two mechanisms that, together, enable
                 store-wait-free implementations of any memory
                 consistency model. To eliminate buffer-capacity-related
                 stalls, we propose the scalable store buffer, which
                 places private/speculative values directly into the L1
                 cache, thereby eliminating the non-scalable associative
                 search of conventional store buffers. To eliminate
                 ordering-related stalls, we propose atomic sequence
                 ordering, which enforces ordering constraints over
                 coarse-grain access sequences while relaxing order
                 among individual accesses. Using cycle-accurate
                 full-system simulation of scientific and commercial
                 applications, we demonstrate that these mechanisms
                 allow the simplified programming of strict ordering
                 while outperforming conventional implementations on
                 average by 32\% (sequential consistency), 22\% (SPARC
                 total store order) and 9\% (SPARC relaxed memory
                 order).",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "memory consistency models; store buffer design",
}

@Article{Ceze:2007:BBE,
  author          = {Luis Ceze and James Tuck and Pablo Montesinos and
                     Josep Torrellas},
  title           = {{BulkSC}: bulk enforcement of sequential
                     consistency},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {35},
  number          = {2},
  pages           = {278--289},
  month           = may,
  year            = {2007},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/1273440.1250697},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Tue Jun 17 11:48:43 MDT 2008},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {While Sequential Consistency (SC) is the most
                     intuitive memory consistency model and the one most
                     programmers likely assume, current multiprocessors
                     do not support it. Instead, they support more
                     relaxed models that deliver high performance. SC
                     implementations are considered either too slow or
                     -- when they can match the performance of relaxed
                     models -- too difficult to implement.\par

                     In this paper, we propose Bulk Enforcement of SC
                     (BulkSC), a novel way of providing SC that is
                     simple to implement and offers performance
                     comparable to Release Consistency (RC). The idea is
                     to dynamically group sets of consecutive
                     instructions into chunks that appear to execute
                     atomically and in isolation. The hardware enforces
                     SC at the coarse grain of chunks which, to the
                     program, appears as providing SC at the individual
                     memory access level. BulkSC keeps the
                     implementation simple by largely decoupling memory
                     consistency enforcement from processor structures.
                     Moreover, it delivers high performance by enabling
                     full memory access reordering and overlapping
                     within chunks and across chunks. We describe a
                     complete system architecture that supports BulkSC
                     and show that it delivers performance comparable to
                     RC.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  keywords        = {bulk; chip multiprocessors; memory consistency
                     models; programmability; sequential consistency},
}

@Article{Diniz:2007:LPC,
  author =       "Bruno Diniz and Dorgival Guedes and Wagner {Meira,
                 Jr.} and Ricardo Bianchini",
  title =        "Limiting the power consumption of main memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "290--301",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1250662.1250699",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The peak power consumption of hardware components
                 affects their power supply, packaging, and cooling
                 requirements. When the peak power consumption is high,
                 the hardware components or the systems that use them
                 can become expensive and bulky. Given that components
                 and systems rarely (if ever) actually require peak
                 power, it is highly desirable to limit power
                 consumption to a less-than-peak power budget, based on
                 which power supply, packaging, and cooling
                 infrastructures can be more intelligently
                 provisioned.\par

                 In this paper, we study dynamic approaches for limiting
                 the power consumption of main memories. Specifically,
                 we propose four techniques that limit consumption by
                 adjusting the power states of the memory devices, as a
                 function of the load on the memory subsystem. Our
                 simulations of applications from three benchmarks
                 demonstrate that our techniques can consistently limit
                 power to a pre-established budget. Two of the
                 techniques can limit power with very low performance
                 degradation. Our results also show that, when using
                 these superior techniques, limiting power is at least
                 as effective an energy-conservation approach as
                 state-of-the-art techniques explicitly designed for
                 performance-aware energy conservation. These latter
                 results represent a departure from current energy
                 management research and practice.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "main memory; performance; power and energy
                 management",
}

@Article{Mesa-Martinez:2007:PMV,
  author =       "Francisco Javier Mesa-Martinez and Joseph
                 Nayfach-Battilana and Jose Renau",
  title =        "Power model validation through thermal measurements",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "302--311",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1250662.1250700",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Simulation environments are an indispensable tool in
                 the design, prototyping, performance evaluation, and
                 analysis of computer systems. Simulator must be able to
                 faithfully reflect the behavior of the system being
                 analyzed. To ensure the accuracy of the simulator, it
                 must be verified and determined to closely match
                 empirical data. Modern processors provide enough
                 performance counters to validate the majority of the
                 performance models; nevertheless, the information
                 provided is not enough to validate power and thermal
                 models.\par

                 In order to address some of the difficulties associated
                 with the validation of power and thermal models, this
                 paper proposes an infrared measurement setup to capture
                 run-time power consumption and thermal characteristics
                 of modern chips. We use infrared cameras with high
                 spatial resolution ($ 10 \times 10 \, \mu $m) and
                 high frame rate (125fps) to capture thermal maps. To
                 generate a detailed power breakdown (leakage and
                 dynamic) for each processor floorplan unit, we employ
                 genetic algorithms. The genetic algorithm finds a power
                 equation for each floorplan block that produces the
                 measured temperature for a given thermal package. The
                 difference between the predicted power and the
                 externally measured power consumption for an AMD Athlon
                 analyzed in this paper has less than 1\% discrepancy.
                 As an example of applicability, we compare the obtained
                 measurements with CACTI power models, and propose
                 extensions to existing thermal models to increase
                 accuracy.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "power and thermal measurements",
}

@Article{Lin:2007:TMM,
  author =       "Jiang Lin and Hongzhong Zheng and Zhichun Zhu and
                 Howard David and Zhao Zhang",
  title =        "Thermal modeling and management of {DRAM} memory
                 systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "312--322",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1250662.1250701",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "With increasing speed and power density,
                 high-performance memories, including FB-DIMM (Fully
                 Buffered DIMM) and DDR2 DRAM, now begin to require
                 dynamic thermal management (DTM) as processors and hard
                 drives did. The DTM of memories, nevertheless, is
                 different in that it should take the processor
                 performance and power consumption into consideration.
                 Existing schemes have ignored that. In this study, we
                 investigate a new approach that controls the memory
                 thermal issues from the source generating memory
                 activities---the processor. It will smooth the program
                 execution when compared with shutting down memory
                 abruptly, and therefore improve the overall system
                 performance and power efficiency. For multicore
                 systems, we propose two schemes called adaptive core
                 gating and coordinated DVFS. The first scheme activates
                 clock gating on selected processor cores and the second
                 one scales down the frequency and voltage levels of
                 processor cores when the memory is to be over-heated.
                 They can successfully control the memory activities and
                 handle thermal emergency. More importantly, they
                 improve performance significantly under the given
                 thermal envelope. Our simulation results show that
                 adaptive core gating improves performance by up to
                 23.3\% (16.3\% on average) on a four-core system with
                 FB-DIMM when compared with DRAM thermal shutdown; and
                 coordinated DVFS with control-theoretic methods
                 improves the performance by up to 18.5\% (8.3\% on
                 average).",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "DRAM memories; thermal management; thermal modeling",
}

@Article{Tiwari:2007:RPA,
  author =       "Abhishek Tiwari and Smruti R. Sarangi and Josep
                 Torrellas",
  title =        "{ReCycle}: pipeline adaptation to tolerate process
                 variation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "323--334",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1250662.1250703",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Process variation affects processor pipelines by
                 making some stages slower and others faster, therefore
                 exacerbating pipeline unbalance. This reduces the
                 frequency attainable by the pipeline. To improve
                 performance, this paper proposes ReCycle, an
                 architectural framework that comprehensively applies
                 cycle time stealing to the pipeline---transferring the
                 time slack of the faster stages to the slow ones by
                 skewing clock arrival times to latching elements after
                 fabrication. As a result, the pipeline can be clocked
                 with a period equal to the average stage delay rather
                 than the longest one. In addition, ReCycle's frequency
                 gains are enhanced with Donor stages, which are empty
                 stages added to `donate' slack to the slow stages.
                 Finally, ReCycle can also convert slack into power
                 reductions.\par

                 For a 17FO4 pipeline, ReCycle increases the frequency
                 by 12\% and the application performance by 9\% on
                 average. Combining ReCycle and donor stages delivers
                 improvements of 36\% in frequency and 15\% in
                 performance on average, completely reclaiming the
                 performance losses due to variation.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "clock skew; pipeline; process variation",
}

@Article{Sassone:2007:MSR,
  author =       "Peter G. Sassone and Jeff {Rupley II} and Edward
                 Brekelbaum and Gabriel H. Loh and Bryan Black",
  title =        "Matrix scheduler reloaded",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "335--346",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1273440.1250704",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "From multiprocessor scale-up to cache sizes to the
                 number of reorder-buffer entries, microarchitects wish
                 to reap the benefits of more computing resources while
                 staying within power and latency bounds. This tension
                 is quite evident in schedulers, which need to be large
                 and single-cycle for maximum performance on
                 out-of-order cores. In this work we present two
                 straightforward modifications to a matrix scheduler
                 implementation which greatly strengthen its
                 scalability. Both are based on the simple observation
                 that the wakeup and picker matrices are sparse, even at
                 small sizes; thus small indirection tables can be used
                 to greatly reduce their width and latency. This
                 technique can be used to create quicker iso-performance
                 schedulers (17--58\% reduced critical path) or larger
                 iso-timing schedulers (7--26\% IPC increase).
                 Importantly, the power and area requirements of the
                 additional hardware are likely offset by the greatly
                 reduced matrix sizes and subsuming the functionality of
                 the power-hungry allocation CAMs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "matrix; microarchitecture; picker; scheduler; wakeup",
}

@Article{Sethumadhavan:2007:LBE,
  author =       "Simha Sethumadhavan and Franziska Roesner and Joel S.
                 Emer and Doug Burger and Stephen W. Keckler",
  title =        "Late-binding: enabling unordered load-store queues",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "347--357",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1273440.1250705",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Conventional load/store queues (LSQs) are an
                 impediment to both power-efficient execution in
                 superscalar processors and scaling to large-window
                 designs. In this paper, we propose techniques to
                 improve the area and power efficiency of LSQs by
                 allocating entries when instructions issue (`late
                 binding'), rather than when they are dispatched. This
                 approach enables lower occupancy and thus smaller LSQs.
                 Efficient implementations of late-binding LSQs,
                 however, require the entries in the LSQ to be unordered
                 with respect to age. In this paper, we show how to
                 provide full LSQ functionality in an unordered design
                 with only small additional complexity and negligible
                 performance losses. We show that late-binding,
                 unordered LSQs work well for small-window superscalar
                 processors, but can also be scaled effectively to
                 large, kilo-window processors by breaking the LSQs into
                 address-interleaved banks. To handle the increased
                 overflows, we apply classic network flow control
                 techniques to the processor micronetworks, enabling
                 low-overhead recovery mechanisms from bank overflows.
                 We evaluate three such mechanisms: instruction replay,
                 skid buffers, and virtual-channel buffering in the
                 on-chip memory network. We show that for an
                 80-instruction window, the LSQ can be reduced to 32
                 entries. For a 1024-instruction window, the unordered,
                 late-binding LSQ works well with four banks of 48
                 entries each. By applying a Bloom filter as well, this
                 design achieves full hardware memory disambiguation for
                 a 1,024 instruction window while requiring low average
                 power per load and store access of 8 and 12 CAM
                 entries, respectively.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "late binding; memory disambiguation; network flow
                 control",
}

@Article{Leverich:2007:CMS,
  author = {Jacob Leverich and Hideho Arakida and Alex Solomatnikov and
    Amin Firoozshahian and Mark Horowitz and Christos Kozyrakis},
  title = {Comparing memory systems for chip multiprocessors},
  journal = j-COMP-ARCH-NEWS,
  volume = 35,
  number = 2,
  pages = {358--368},
  month = may,
  year = 2007,
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/1273440.1250707},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Tue Jun 17 11:48:43 MDT 2008},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {There are two basic models for the on-chip memory in CMP
    systems: hardware-managed coherent caches and software-managed
    streaming memory. This paper performs a direct comparison of the two
    models under the same set of assumptions about technology, area, and
    computational capabilities. The goal is to quantify how and when they
    differ in terms of performance, energy consumption, bandwidth
    requirements, and latency tolerance for general-purpose CMPs. We
    demonstrate that for data-parallel applications, the cache-based and
    streaming models perform and scale equally well. For certain
    applications with little data reuse, streaming scales better due to
    better bandwidth use and macroscopic software prefetching. However,
    the introduction of techniques such as hardware prefetching and
    non-allocating stores to the cache-based model eliminates the
    streaming advantage. Overall, our results indicate that there is not
    sufficient advantage in building streaming memory systems where all
    on-chip memory structures are explicitly managed. On the other hand,
    we show that streaming at the programming model level is particularly
    beneficial, even with the cache-based model, as it enhances locality
    and creates opportunities for bandwidth optimizations. Moreover, we
    observe that stream programming is actually easier with the
    cache-based model because the hardware guarantees correct,
    best-effort execution even when the programmer cannot fully
    regularize an application's code.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  keywords = {chip multiprocessors; coherent caches; locality
    optimizations; parallel programming; streaming memory},
}

@Article{Muralimanohar:2007:IDC,
  author =       "Naveen Muralimanohar and Rajeev Balasubramonian",
  title =        "Interconnect design considerations for large {NUCA}
                 caches",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "369--380",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1273440.1250708",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The ever increasing sizes of on-chip caches and the
                 growing domination of wire delay necessitate
                 significant changes to cache hierarchy design
                 methodologies. Many recent proposals advocate splitting
                 the cache into a large number of banks and employing a
                 network-on-chip (NoC) to allow fast access to nearby
                 banks (referred to as Non-Uniform Cache
                 Architectures---NUCA). Most studies on NUCA
                 organizations have assumed a generic NoC and focused on
                 logical policies for cache block placement, movement,
                 and search. Since wire/router delay and power are major
                 limiting factors in modern processors, this work
                 focuses on interconnect design and its influence on
                 NUCA performance and power. We extend the widely-used
                 CACTI cache modeling tool to take network design
                 parameters into account. With these overheads
                 appropriately accounted for, the optimal cache
                 organization is typically very different from that
                 assumed in prior NUCA studies. To alleviate the
                 interconnect delay bottleneck, we propose novel cache
                 access optimizations that introduce heterogeneity
                 within the inter-bank network. The careful
                 consideration of interconnect choices for a large cache
                 results in a 51\% performance improvement over a
                 baseline generic NoC and the introduction of
                 heterogeneity within the network yields an additional
                 11--15\% performance improvement.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "cache models; interconnect; memory hierarchies;
                 network-on-chip; non-uniform cache architecture",
}

@Article{Qureshi:2007:AIP,
  author = {Moinuddin K. Qureshi and Aamer Jaleel and Yale N. Patt and
    Simon C. Steely and Joel Emer},
  title = {Adaptive insertion policies for high performance caching},
  journal = j-COMP-ARCH-NEWS,
  volume = 35,
  number = 2,
  pages = {381--391},
  month = may,
  year = 2007,
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/1250662.1250709},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Tue Jun 17 11:48:43 MDT 2008},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {The commonly used LRU replacement policy is susceptible to
    thrashing for memory-intensive workloads that have a working set
    greater than the available cache size. For such applications, the
    majority of lines traverse from the MRU position to the LRU position
    without receiving any cache hits, resulting in inefficient use of
    cache space. Cache performance can be improved if some fraction of
    the working set is retained in the cache so that at least that
    fraction of the working set can contribute to cache hits.\par

    We show that simple changes to the insertion policy can
    significantly reduce cache misses for memory-intensive workloads. We
    propose the LRU Insertion Policy (LIP) which places the incoming line
    in the LRU position instead of the MRU position. LIP protects the
    cache from thrashing and results in close to optimal hit rate for
    applications that have a cyclic reference pattern. We also propose
    the Bimodal Insertion Policy (BIP) as an enhancement of LIP that
    adapts to changes in the working set while maintaining the thrashing
    protection of LIP. We finally propose a Dynamic Insertion Policy
    (DIP) to choose between BIP and the traditional LRU policy depending
    on which policy incurs fewer misses. The proposed insertion policies
    do not require any change to the existing cache structure, are
    trivial to implement, and have a storage requirement of less than two
    bytes. We show that DIP reduces the average MPKI of the baseline 1MB
    16-way L2 cache by 21\%, bridging two-thirds of the gap between LRU
    and OPT.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  keywords = {replacement; set dueling; set sampling; thrashing},
}

@Article{Karger:2007:PSL,
  author = {Paul A. Karger},
  title = {Performance and security lessons learned from virtualizing
    the {Alpha} processor},
  journal = j-COMP-ARCH-NEWS,
  volume = 35,
  number = 2,
  pages = {392--401},
  month = may,
  year = 2007,
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/1273440.1250711},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Tue Jun 17 11:48:43 MDT 2008},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {Virtualization has become much more important throughout
    the computer industry both to improve security and to support
    multiple workloads on the same hardware with effective isolation
    between those workloads. The most widely used chip architecture, the
    Intel and AMD x86 processors, have begun to support virtualization,
    but the initial implementations show some limitations. This paper
    examines the virtualization properties of the Alpha architecture with
    particular emphasis on features that improve performance and
    security. It shows how the Alpha's features of PALcode, address space
    numbers, software handling of translation buffer misses, lack of used
    and modified bits, and secure handling of unpredictable results all
    contribute to making virtualization of the Alpha particularly easy.
    The paper then compares the virtual architecture of the Alpha with
    Intel's and AMD's virtualization approaches for x86. It also comments
    briefly on Intel's virtualization technology for Itanium, IBM's
    zSeries and pSeries hypervisors and Sun's UltraSPARC virtualization.
    It particularly identifies some differences between translation
    buffers on x86 and translation buffers on VAX and Alpha that can have
    adverse performance consequences.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  keywords = {hypervisors; security; virtual machine monitors;
    virtualizability},
}

@Article{Karkhanis:2007:ADA,
  author = {Tejas S. Karkhanis and James E. Smith},
  title = {Automated design of application specific superscalar
    processors: an analytical approach},
  journal = j-COMP-ARCH-NEWS,
  volume = 35,
  number = 2,
  pages = {402--411},
  month = may,
  year = 2007,
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/1250662.1250712},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Tue Jun 17 11:48:43 MDT 2008},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {Analytical modeling is applied to the automated design of
    application-specific superscalar processors. Using an analytical
    method bridges the gap between the size of the design space and the
    time required for detailed cycle-accurate simulations. The proposed
    design framework takes as inputs the design targets (upper bounds on
    execution time, area, and energy), design alternatives, and one or
    more application programs. The output is the set of out-of-order
    superscalar processors that are Pareto-optimal with respect to
    performance-energy-area. The core of the new design framework is made
    up of analytical performance and energy activity models, and an
    analytical model-based design optimization process.\par

    For a set of benchmark programs and a design space of 2000 designs,
    the design framework arrives at all performance-energy-area
    Pareto-optimal design points within 16 minutes on a 2 GHz Pentium-4.
    In contrast, it is estimated that a na{\"\i}ve cycle-accurate
    simulation-based exhaustive search would require at least two months
    to arrive at the Pareto-optimal design points for the same design
    space.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  keywords = {analytical model; application specific processors; design
    optimization; energy model; performance model},
}

@Article{Phansalkar:2007:ARA,
  author = {Aashish Phansalkar and Ajay Joshi and Lizy K. John},
  title = {Analysis of redundancy and application balance in the
    {SPEC CPU2006} benchmark suite},
  journal = j-COMP-ARCH-NEWS,
  volume = 35,
  number = 2,
  pages = {412--423},
  month = may,
  year = 2007,
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/1250662.1250713},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Tue Jun 17 11:48:43 MDT 2008},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {The recently released SPEC CPU2006 benchmark suite is
    expected to be used by computer designers and computer architecture
    researchers for pre-silicon early design analysis. Partial use of
    benchmark suites by researchers, due to simulation time constraints,
    compiler difficulties, or library or system call issues is likely to
    happen; but a random subset can lead to misleading results. This
    paper analyzes the SPEC CPU2006 benchmarks using performance counter
    based experimentation from several state of the art systems, and uses
    statistical techniques such as principal component analysis and
    clustering to draw inferences on the similarity of the benchmarks and
    the redundancy in the suite and arrive at meaningful subsets.\par

    The SPEC CPU2006 benchmark suite contains several programs from
    areas such as artificial intelligence and includes none from the
    electronic design automation (EDA) application area. Hence there is a
    concern on the application balance in the suite. An analysis from the
    perspective of fundamental program characteristics shows that the
    included programs offer characteristics broader than the EDA
    programs' space. A subset of 6 integer programs and 8 floating point
    programs can yield most of the information from the entire suite.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  keywords = {benchmark; clustering; microprocessor performance
    counters; SPEC},
}

@Article{Kim:2007:VPR,
  author = {Hyesoon Kim and Jos{\'e} A. Joao and Onur Mutlu and Chang Joo
    Lee and Yale N. Patt and Robert Cohn},
  title = {{VPC} prediction: reducing the cost of indirect branches via
    hardware-based dynamic devirtualization},
  journal = j-COMP-ARCH-NEWS,
  volume = 35,
  number = 2,
  pages = {424--435},
  month = may,
  year = 2007,
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/1250662.1250715},
  ISSN = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L = {0163-5964},
  bibdate = {Tue Jun 17 11:48:43 MDT 2008},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {Indirect branches have become increasingly common in
    modular programs written in modern object-oriented languages and
    virtual machine based runtime systems. Unfortunately, the prediction
    accuracy of indirect branches has not improved as much as that of
    conditional branches. Furthermore, previously proposed indirect
    branch predictors usually require a significant amount of extra
    hardware storage and complexity, which makes them less attractive to
    implement.\par

    This paper proposes a new technique for handling indirect branches,
    called Virtual Program Counter (VPC) prediction. The key idea of VPC
    prediction is to treat a single indirect branch as multiple virtual
    conditional branches in hardware for prediction purposes. Our
    technique predicts each of the virtual conditional branches using the
    existing conditional branch prediction hardware. Thus, no separate
    storage structure is required for predicting indirect branch
    targets.\par

    Our evaluation shows that VPC prediction improves average
    performance by 26.7\% compared to a commonly-used branch target
    buffer based predictor on 12 indirect branch intensive applications.
    VPC prediction achieves the performance improvement provided by at
    least a 12KB (and usually a 192KB) tagged target cache predictor on
    half of the examined applications. We show that VPC prediction can be
    used with any existing conditional branch prediction mechanism and
    that the accuracy of VPC prediction improves when a more accurate
    conditional branch predictor is used.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  keywords = {devirtualization; indirect branch prediction; virtual
    functions},
}

@Article{Hilton:2007:GCI,
  author =       "Andrew D. Hilton and Amir Roth",
  title =        "{Ginger}: control independence using tag rewriting",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "436--447",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1250662.1250716",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The negative performance impact of branch
                 mis-predictions can be reduced by exploiting control
                 independence (CI). When a branch mis-predicts, the
                 wrong-path instructions up to the point where control
                 converges with the correct path are selectively
                 squashed and replaced with correct-path instructions.
                 Instructions beyond the convergence point---the
                 branch's control-independent (CI) instructions---are
                 spared from squashing. Exploiting CI requires updating
                 the input data dependences of CI instructions to
                 reflect the selective removal and insertion of
                 logically older instructions and transitively
                 re-dispatching those CI instructions whose inputs have
                 changed. This capability is generally called
                 out-of-order renaming. Previously proposed CI designs
                 use out-of-order renaming schemes that either consume
                 excessive rename/dispatch bandwidth, can only be
                 applied in limited cases, or incur a cost even when the
                 branch would be correctly predicted.\par

                 Ginger is a CI design that is both general and
                 bandwidth efficient. Ginger implements out-of-order
                 renaming using tag rewriting, re-linking the input
                 dependences of CI instructions as they sit in the
                 window. To do this, Ginger halts the pipeline uses the
                 idle map table read and write ports and the issue queue
                 match lines and write lines to perform a register-tag
                 `search-and-replace' operation. After a few cycles, the
                 pipeline restarts and execution resumes with correct
                 data dependences. Cycle-level simulation shows that
                 Ginger out-performs previous CI designs, yielding
                 geometric mean speedups over an aggressive non-CI
                 processor of 5\%, 12\%, and 11\%---on SPECint2000,
                 MediaBench, and Comm-Bench---with speedups of 15\% or
                 greater on 11 of 46 programs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "branch misprediction; control independence;
                 out-of-order renaming; selective re-dispatch",
}

%%% Al-Zawawi, Reddy, Rotenberg, and Akkary: transparent control
%%% independence (TCI); volume 35, number 2 (May 2007), pages 448--459.
%%% The abstract uses TeX punctuation: --- for parenthetical dashes and
%%% `...' for quoted terms.
%%% NOTE(review): the DOI prefix 10.1145/1273440 differs from the
%%% 10.1145/1250662 prefix on most neighbouring volume 35, number 2
%%% entries; presumably both resolve to the same paper --- confirm.
@Article{Al-Zawawi:2007:TCI,
  author =       "Ahmed S. Al-Zawawi and Vimal K. Reddy and Eric
                 Rotenberg and Haitham H. Akkary",
  title =        "Transparent control independence {(TCI)}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "448--459",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1273440.1250717",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Superscalar architectures have been proposed that
                 exploit control independence, reducing the performance
                 penalty of branch mispredictions by preserving the work
                 of future misprediction-independent instructions. The
                 essential goal of exploiting control independence is to
                 completely decouple future misprediction-independent
                 instructions from deferred misprediction-dependent
                 instructions. Current implementations fall short of
                 this goal because they explicitly maintain program
                 order among misprediction-independent and
                 misprediction-dependent instructions. Explicit
                 approaches sacrifice design efficiency and ultimately
                 performance.\par

                 We observe it is sufficient to emulate program order.
                 Potential misprediction-dependent instructions are
                 singled out a priori and their unchanging source values
                 are checkpointed. These instructions and values are set
                 aside as a `recovery program'. Checkpointed source
                 values break the data dependencies with co-mingled
                 misprediction-independent instructions---now long since
                 gone from the pipeline---achieving the essential
                 decoupling objective. When the mispredicted branch
                 resolves, recovery is achieved by fetching the
                 self-sufficient, condensed recovery program. Recovery
                 is effectively transparent to the pipeline, in that
                 speculative state is not rolled back and recovery
                 appears as a jump to code. A coarse-grain retirement
                 substrate permits the relaxed order between the
                 decoupled programs. Transparent control independence
                 (TCI) yields a highly streamlined pipeline that quickly
                 recycles resources based on conventional speculation,
                 enabling a large window with small cycle-critical
                 resources, and prevents many mispredictions from
                 disrupting this large window.\par

                 TCI achieves speedups as high as 64\% (16\% average)
                 and 88\% (22\% average) for 4-issue and 8-issue
                 pipelines, respectively, among 15 SPEC integer
                 benchmarks. Factors that limit the performance of
                 explicitly ordered approaches are quantified.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "branch prediction; checkpoints; control independence;
                 selective re-execution; selective recovery;
                 speculation",
}

%%% Wang, Mahesri, and Patel: ACE analysis reliability estimates checked
%%% against fault injection; volume 35, number 2 (May 2007), pages
%%% 460--469.  The abstract uses TeX punctuation: `...' for quoted terms
%%% and an en-dash with $\times$ for the overestimation range.
%%% NOTE(review): the DOI prefix 10.1145/1273440 differs from the
%%% 10.1145/1250662 prefix on most neighbouring volume 35, number 2
%%% entries; presumably both resolve to the same paper --- confirm.
@Article{Wang:2007:EAA,
  author =       "Nicholas J. Wang and Aqeel Mahesri and Sanjay J.
                 Patel",
  title =        "Examining {ACE} analysis reliability estimates using
                 fault-injection",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "460--469",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1273440.1250719",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "ACE analysis is a technique to provide an early
                 reliability estimate for microprocessors. ACE analysis
                 couples data from abstract performance models with low
                 level design details to identify and rule out transient
                 faults that will not cause incorrect execution. While
                 many transient faults are analyzable in ACE analysis
                 frameworks, some are not. As a result, ACE analysis is
                 conservative and provides a lower bound for the
                 reliability of a processor design. Bounding the
                 reliability of a design is useful since it can
                 guarantee that the given design will meet reliability
                 goals.\par

                 In this work, we quantify and identify the sources of
                 ACE analysis conservatism by comparing an ACE analysis
                 methodology against a rigorous fault-injection study.
                 We evaluate two flavors of ACE analysis: a `simple'
                 analysis and a refined analysis, finding that even the
                 refined analysis overestimates the soft error
                 vulnerability of an instruction scheduler by 2--3$\times$. The
                 conservatism stems from two key sources: from lack of
                 detail in abstract performance models and from what we
                 term Y-Bits, a result of the single-pass simulation
                 methodology that is typical of ACE analysis. We also
                 examine the efficacy of applying ACE analysis to a
                 class of `partial coverage' error mitigation
                 techniques. In particular, we perform a case study on
                 one such technique and extrapolate our findings to
                 others.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "fault tolerance; measurement techniques;
                 microprocessors; soft errors",
}

%%% Aggarwal, Ranganathan, Jouppi, and Smith: configurable isolation for
%%% high-availability systems on commodity multi-core processors;
%%% volume 35, number 2 (May 2007), pages 470--481.
@Article{Aggarwal:2007:CIB,
  author =       "Nidhi Aggarwal and Parthasarathy Ranganathan and
                 Norman P. Jouppi and James E. Smith",
  title =        "Configurable isolation: building high availability
                 systems with commodity multi-core processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "470--481",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1250662.1250720",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "High availability is an increasingly important
                 requirement for enterprise systems, often valued more
                 than performance. Systems designed for high
                 availability typically use redundant hardware for error
                 detection and continued uptime in the event of a
                 failure. Chip multiprocessors with an abundance of
                 identical resources like cores, cache and
                 interconnection networks would appear to be ideal
                 building blocks for implementing high availability
                 solutions on chip. However, doing so poses significant
                 challenges with respect to error containment and faulty
                 component replacement. Increasing silicon and transient
                 fault rates with future technology scaling exacerbate
                 the problem. This paper proposes a novel,
                 cost-effective, architecture for high availability
                 systems built from future multi-core processors. We
                 propose a new chip multiprocessor architecture that
                 provides configurable isolation for fault containment
                 and component retirement, based upon cost-effective
                 modifications to commodity designs. The design is
                 evaluated for a state-of-the-art industrial fault model
                 and the proposed architecture is shown to provide
                 effective fault isolation and graceful degradation even
                 when the failure rate is high.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "chip multiprocessors; fault isolation; high
                 availability",
}

%%% Dalton, Kannan, and Kozyrakis: Raksha, an architecture for software
%%% security based on dynamic information flow tracking (DIFT);
%%% volume 35, number 2 (May 2007), pages 482--493.
%%% NOTE(review): the first keyword, `dynamic', looks truncated
%%% (perhaps `dynamic information flow tracking', which the abstract
%%% names) --- verify against the publisher's record before changing.
@Article{Dalton:2007:RFI,
  author =       "Michael Dalton and Hari Kannan and Christos
                 Kozyrakis",
  title =        "{Raksha}: a flexible information flow architecture for
                 software security",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "482--493",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1250662.1250722",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "High-level semantic vulnerabilities such as SQL
                 injection and cross-site scripting have surpassed
                 buffer overflows as the most prevalent security
                 exploits. The breadth and diversity of software
                 vulnerabilities demand new security solutions that
                 combine the speed and practicality of hardware
                 approaches with the flexibility and robustness of
                 software systems.\par

                 This paper proposes Raksha, an architecture for
                 software security based on dynamic information flow
                 tracking (DIFT). Raksha provides three novel features
                 that allow for a flexible hardware/software approach to
                 security. First, it supports flexible and programmable
                 security policies that enable software to direct
                 hardware analysis towards a wide range of high-level
                 and low-level attacks. Second, it supports multiple
                 active security policies that can protect the system
                 against concurrent attacks. Third, it supports
                 low-overhead security handlers that allow software to
                 correct, complement, or extend the hardware-based
                 analysis without the overhead associated with operating
                 system traps.\par

                 We present an FPGA prototype for Raksha that provides a
                 full featured Linux workstation for security analysis.
                 Using unmodified binaries for real-world applications,
                 we demonstrate that Raksha can detect high-level
                 attacks such as directory traversal, command injection,
                 SQL injection, and cross-site scripting as well as
                 low-level attacks such as buffer overflows. We also
                 show that low overhead exception handling is critical
                 for analyses such as memory corruption protection in
                 order to address false positives that occur due to the
                 diverse code patterns in frequently used software.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "dynamic; semantic vulnerabilities; software security",
}

%%% Wang and Lee: security-aware cache designs (PLcache and RPcache)
%%% against software cache-based side channel attacks; volume 35,
%%% number 2 (May 2007), pages 494--505.  Parenthetical dashes in the
%%% abstract are written as TeX em-dashes (---).
@Article{Wang:2007:NCD,
  author =       "Zhenghong Wang and Ruby B. Lee",
  title =        "New cache designs for thwarting software cache-based
                 side channel attacks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "494--505",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1250662.1250723",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Software cache-based side channel attacks are a
                 serious new class of threats for computers. Unlike
                 physical side channel attacks that mostly target
                 embedded cryptographic devices, cache-based side
                 channel attacks can also undermine general purpose
                 systems. The attacks are easy to perform, effective on
                 most platforms, and do not require special instruments
                 or excessive computation power. In recently
                 demonstrated attacks on software implementations of
                 ciphers like AES and RSA, the full key can be recovered
                 by an unprivileged user program performing simple
                 timing measurements based on cache misses.\par

                 We first analyze these attacks, identifying cache
                 interference as the root cause of these attacks. We
                 identify two basic mitigation approaches: the
                 partition-based approach eliminates cache interference
                 whereas the randomization-based approach randomizes
                 cache interference so that zero information can be
                 inferred. We present new security-aware cache designs,
                 the Partition-Locked cache (PLcache) and Random
                 Permutation cache (RPcache), analyze and prove their
                 security, and evaluate their performance. Our results
                 show that our new cache designs with built-in security
                 can defend against cache-based side channel attacks in
                 general---rather than only specific attacks on a given
                 cryptographic algorithm---with very little performance
                 degradation and hardware cost.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "cache; computer architecture; processor; security;
                 side channel; timing attacks",
}

%%% Soundararajan, Parashar, and Sivasubramaniam: mechanisms for
%%% bounding the vulnerabilities of processor structures; volume 35,
%%% number 2 (May 2007), pages 506--515.  Parenthetical dashes in the
%%% abstract are written as TeX em-dashes (---).
@Article{Soundararajan:2007:MBV,
  author =       "Niranjan Kumar Soundararajan and Angshuman Parashar
                 and Anand Sivasubramaniam",
  title =        "Mechanisms for bounding vulnerabilities of processor
                 structures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "506--515",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1250662.1250725",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Concern for the increasing susceptibility of processor
                 structures to transient errors has led to several
                 recent research efforts that propose architectural
                 techniques to enhance reliability. However, real
                 systems are typically required to satisfy hard
                 reliability budgets, and barring expensive
                 full-redundancy approaches, none of the proposed
                 solutions treat any reliability budgets or bounds as
                 hard constraints. Meeting vulnerability bounds requires
                 monitoring vulnerabilities of processor structures and
                 taking appropriate actions whenever these bounds are
                 violated. This mandates treating reliability as a
                 first-order microarchitecture design constraint, while
                 optimizing performance as long as reliability
                 requirements are satisfied. This paper makes three key
                 contributions towards this goal: (i) we present a
                 simple infrastructure to monitor and provide upper
                 bounds on the vulnerabilities of key processor
                 structures at cycle-level fidelity; (ii) we propose two
                 distinct control mechanisms---throttling and selective
                 redundancy---to proactively and/or reactively bound the
                 vulnerabilities to any limit specified by the system
                 designer; (iii) within this framework, we propose a
                 novel adaptation of Out-of-Order Commit for
                 vulnerability reduction, which automatically provides
                 additional leverage for the control mechanisms to boost
                 performance while remaining within the reliability
                 budget.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "microarchitecture; redundant threading; transient
                 faults",
}

%%% Walcott, Humphreys, and Gurumurthi: predicting the Architectural
%%% Vulnerability Factor (AVF) from microarchitectural state;
%%% volume 35, number 2 (May 2007), pages 516--527.
@Article{Walcott:2007:DPA,
  author =       "Kristen R. Walcott and Greg Humphreys and Sudhanva
                 Gurumurthi",
  title =        "Dynamic prediction of architectural vulnerability from
                 microarchitectural state",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "2",
  pages =        "516--527",
  month =        may,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1250662.1250726",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:43 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Transient faults due to particle strikes are a key
                 challenge in microprocessor design. Driven by
                 exponentially increasing transistor counts, per-chip
                 faults are a growing burden. To protect against soft
                 errors, redundancy techniques such as redundant
                 multithreading (RMT) are often used. However, these
                 techniques assume that the probability that a
                 structural fault will result in a soft error (i.e., the
                 Architectural Vulnerability Factor (AVF)) is 100
                 percent, unnecessarily draining processor resources.
                 Due to the high cost of redundancy, there have been
                 efforts to throttle RMT at runtime. To date, these
                 methods have not incorporated an AVF model and
                 therefore tend to be ad hoc. Unfortunately, computing
                 the AVF of complex microprocessor structures (e.g., the
                 ISQ) can be quite involved.\par

                 To provide probabilistic guarantees about fault
                 tolerance, we have created a rigorous characterization
                 of AVF behavior that can be easily implemented in
                 hardware. We experimentally demonstrate AVF variability
                 within and across the SPEC2000 benchmarks and identify
                 strong correlations between structural AVF values and a
                 small set of processor metrics. Using these simple
                 indicators as predictors, we create a proof-of-concept
                 RMT implementation that demonstrates that AVF
                 prediction can be used to maintain a low fault
                 tolerance level without significant performance
                 impact.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "architecture vulnerability factor; microarchitecture;
                 performance; redundant multithreading; reliability",
}

%%% Aggarwal, Bose, and Zahran: one-page introduction to the special
%%% issue on the 2006 Reconfigurable and Adaptive Architecture Workshop
%%% (RAAW 2006); volume 35, number 3 (June 2007), page 1.  The ordinal
%%% in the abstract is plain text so that no letters are set in math
%%% italic.
@Article{Aggarwal:2007:ISI,
  author =       "Aneesh Aggarwal and Pradip Bose and Mohamed Zahran",
  title =        "Introduction to the special issue on the {2006
                 Reconfigurable and Adaptive Architecture Workshop}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "3",
  pages =        "1--1",
  month =        jun,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1294313.1294317",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:27 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The papers that follow comprise the proceedings of the
                 first Reconfigurable and Adaptive Architecture Workshop
                 (RAAW 2006) that was held in conjunction with the
                 39th International Conference on Microarchitecture
                 in Orlando, Florida.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Bellas, Chai, Dwyer, and Linzmeier: a design automation tool that
%%% generates streaming accelerators for reconfigurable platforms;
%%% volume 35, number 3 (June 2007), pages 2--8.
%%% NOTE(review): presumably part of the RAAW 2006 special issue
%%% introduced by Aggarwal:2007:ISI (same volume and number) --- confirm.
@Article{Bellas:2007:MSA,
  author =       "Nikolaos Bellas and Sek M. Chai and Malcolm Dwyer and
                 Dan Linzmeier",
  title =        "Mapping streaming architectures on reconfigurable
                 platforms",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "3",
  pages =        "2--8",
  month =        jun,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1294313.1294318",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:27 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Hardware accelerators, used as application-specific
                 extensions to the computational capabilities of a
                 system, are efficient mechanisms to enhance the
                 performance and reduce the power dissipation in a
                 System On Chip (SoC). These accelerators execute on the
                 computationally critical part of the application, and
                 offload computations from the scalar processors. In
                 this paper, we present a design automation tool that
                 generates accelerators based on a given application
                 kernel. The accelerators are processing streaming data,
                 and support a programming model which can naturally
                 express a large number of embedded applications, and
                 which results in efficient and fast hardware
                 implementations. We demonstrate the applicability of
                 the tool for architectural space exploration for a
                 number of media applications, with results on area,
                 throughput, and clock speeds.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Labrecque, Yiannacouras, and Steffan: compiler-side,
%%% application-specific customization of the base ISA for FPGA soft
%%% processors; volume 35, number 3 (June 2007), pages 9--19.
%%% NOTE(review): presumably part of the RAAW 2006 special issue
%%% introduced by Aggarwal:2007:ISI (same volume and number) --- confirm.
@Article{Labrecque:2007:CCG,
  author =       "Martin Labrecque and Peter Yiannacouras and J. Gregory
                 Steffan",
  title =        "Custom code generation for soft processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "3",
  pages =        "9--19",
  month =        jun,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1294313.1294319",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:27 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Embedded systems designers that use FPGAs are
                 increasingly including soft processors in their designs
                 (configurable processors built in the programmable
                 logic of the FPGA). While there has been a significant
                 amount of research on adding custom instructions and
                 accelerators to soft processors, these are typically
                 used to extend an unmodified base ISA targeted by
                 generic compilation such as with unmodified gcc. In
                 this paper we explore several opportunities for the
                 compiler to optimize the code generated for soft
                 processors through application-specific customization
                 of the base ISA---techniques that are orthogonal to
                 adding custom instructions. In particular we explore:
                 (i) low level software-hardware trade-offs between
                 basic instructions; (ii) the utility of ISA-specific
                 features---in particular for the delay slots and Hi/Lo
                 registers in the MIPS ISA; and (iii) application
                 specific register management. We find that through
                 these techniques that have no hardware cost we can
                 improve the area efficiency of soft processors by 12\%
                 on average across a suite of benchmarks, and by up to
                 47\% in the best case.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Suri: coupling a reconfigurable unit (RFU) with a superscalar
%%% datapath to improve instruction level parallelism; volume 35,
%%% number 3 (June 2007), pages 20--27.
%%% NOTE(review): presumably part of the RAAW 2006 special issue
%%% introduced by Aggarwal:2007:ISI (same volume and number) --- confirm.
@Article{Suri:2007:IIL,
  author =       "Tameesh Suri",
  title =        "Improving instruction level parallelism through
                 reconfigurable units in superscalar processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "3",
  pages =        "20--27",
  month =        jun,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1294313.1294320",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:27 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "With reducing feature sizes, more transistors can be
                 integrated on the chip. The increased transistor budget
                 can be utilized to improve the instruction level
                 parallelism (ILP) exploited from the processor.
                 However, the transistors cannot be used to arbitrarily
                 increase the processor width and size in the hope of
                 exploiting better ILP. In this paper, we propose an
                 architecture where the superscalar datapath is tightly
                 coupled with a reconfigurable unit (RFU). The
                 reconfiguration unit is configured to execute the
                 traces of dynamic instructions that are frequently
                 executed. To address the data dependency issues between
                 the instructions in the superscalar and the RFU, we
                 propose to execute the trace on the RFU with predicted
                 values. When the trace instructions reach the issue
                 queue in the superscalar, the predictions are
                 validated. In this technique, performance improvement
                 is obtained for correct prediction, whereas no
                 performance degradation is incurred for mispredictions.
                 With this architecture, we observe an average
                 instructions per cycle (IPC) improvement of about 11\%
                 over the simulated SPEC 2000 benchmarks, using a very
                 small last value data value predictor.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Najaf-abadi and Rotenberg: architectural contesting, the redundant
%%% execution of code on differently architected cores in a leader
%%% follower arrangement; volume 35, number 3 (June 2007), pages 28--35.
%%% NOTE(review): the title carries explicit font markup
%%% ({\em contesting\/}); presumably this mirrors italics in the
%%% published title --- confirm before normalizing.
@Article{Najaf-abadi:2007:ACE,
  author =       "Hashem H. Najaf-abadi and Eric Rotenberg",
  title =        "Architectural {\em contesting\/}: exposing and
                 exploiting temperamental behavior",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "3",
  pages =        "28--35",
  month =        jun,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1294313.1294321",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:27 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Previous studies have proposed techniques to
                 dynamically change the architecture of a processor to
                 better suit the characteristics of the workload at
                 hand. However, all such approaches are prone to a
                 fundamental trade-off between the architectural
                 diversity they can provide and the latency of
                 architectural change, their fixed-configuration
                 performance and the complexity of finding the best
                 architectural configuration for the workload at hand.
                 In this study we argue that the full potential of
                 dynamic architectural customization can only be
                 achieved by diminishing the effect of the degree of
                 available architectural diversity on the aforementioned
                 performance factors.\par

                 The performance of a statically designed processing
                 core in a heterogeneous multi-core system is
                 independent of the architectural diversity available.
                 In addition, it is apparent that concurrent execution
                 of code on differently architected cores automatically
                 reveals which architecture is more suitable for the
                 characteristics of a particular workload.\par

                 We therefore propose architectural contesting; the
                 redundant execution of code on a number of differently
                 architected processors (each customized for a different
                 set of workload characteristics) in a leader follower
                 arrangement, such that the leader and follower cores
                 continuously shift roles as one core or the other
                 becomes more favorable for new code phases. In this
                 manner effective execution is naturally transferred
                 from one static architecture to the other with little
                 latency.\par

                 In this study, we show that the contesting of only
                 processor width can yield an average speedup of 7.5\%
                 and up to 12.5\% in integer SPEC benchmarks.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tseng:2007:DHS,
  author =       "Kuo-Kun Tseng and Ying-Dar Lin and Tsern-Huei Lee and
                 Yuan-Cheng Lai",
  title =        "Deterministic high-speed root-hashing automaton
                 matching coprocessor for embedded network processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "3",
  pages =        "36--43",
  month =        jun,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1294313.1294314",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:27 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "While string matching plays an important role in deep
                 packet inspection applications, its software algorithms
                 are insufficient to meet the demands of high-speed
                 performance. Accordingly, we were motivated to propose
                 fast and deterministic performance root-hashing
                 automaton matching (RHAM) coprocessor for embedded
                 network processor. Although automaton algorithms are
                 robust with deterministic matching time, there is still
                 plenty of room for improvement of their average-case
                 performance. The proposed RHAM employs novel
                 root-hashing technique to accelerate automaton
                 matching. In our experiment, RHAM is implemented in a
                 prevalent automaton algorithm, Aho--Corasick (AC) which
                 is often used in many packet inspection applications.
                 Compared to the original AC, RHAM only requires extra
                 vector size in 48 Kbytes for root-hashing, and has
                 about 900\% and 420\% outperformance for 20,000 URLs
                 and 10,000 virus patterns respectively. Implementation
                 of RHAM FPGA can perform at the rate of 12.6 Gbps with
                 the pattern amount in 34,215 bytes. This is superior to
                 all previous matching hardware in terms of throughput
                 and pattern set.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "coprocessor; finite automaton; hashing; packet
                 inspection; string matching",
}

@Article{Sibai:2007:PAW,
  author =       "Fadi N. Sibai",
  title =        "Performance analysis and workload characterization of
                 the {$3$DMark05} benchmark on modern parallel computer
                 platforms",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "3",
  pages =        "44--52",
  month =        jun,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1294313.1294315",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:27 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "With ever increasing CPU and graphics card speeds, and
                 improved sophistication, stunning visual effects, and
                 growing scene detail and real life-like content of 3D
                 games, 3DMark{\reg} emerged as the leading PC benchmark
                 for 3D gaming performance with several millions of
                 worldwide downloads. Its tests are at the cutting edge
                 of consumer graphics and push the limit of 3D rendering
                 with spectacular scenes, and state of the art lighting
                 techniques. The benchmark scores help quickly
                 differentiate the platforms with state of the art
                 graphic cards and processors from those with older
                 components. In this paper, we analyze the scaling of
                 the 3DMark{\reg}05 benchmark with CPU frequency, number
                 of CPUs, number of GPUs, and number of threads
                 supported by the hardware. We also characterize the
                 benchmark's workload. These results reveal that the
                 benchmark scales well indicating that 3D games if
                 implemented with multiple Physics and Artificial
                 Intelligence or other relevant content threads should
                 show good scaling too on multi-CPU and multi-GPU
                 platforms. The characterization results reveal the
                 close dependence of 3D graphics applications on the
                 memory subsystem's performance as 1 out of 2
                 instructions is a load or store instruction. The
                 results also revealed that there is a direct
                 correlation with the Game Tests' performance and the
                 number of cache memory read misses per instruction
                 retired, the number of stores retired per instruction
                 retired, the number of polygons per Draw*Primitive; and
                 the number of set-vertex shader calls per frame. All
                 these events relate to the memory subsystem performance
                 generally linking the 3D graphics applications'
                 performance and the 3DMark{\reg} overall score to the
                 platform's memory performance. Salient
                 microarchitectural performance events of the CPU tests
                 were also memory-related.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "3D graphics performance; multiple CPU and GPU core
                 platforms; workload characterization",
}

@Article{Thorson:2007:INb,
  author =       "Mark Thorson",
  title =        "{Internet} nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "3",
  pages =        "53--55",
  month =        jun,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1294313.1294323",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:48:27 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This column consists of selected traffic from the
                 comp.arch newsgroup, a forum for discussion of computer
                 architecture on the Internet---an international
                 computer network.\par

                 As always, the opinions expressed in this column are
                 the personal views of the authors, and do not
                 necessarily represent the institutions to which they
                 are affiliated.\par

                 Text which sets the context of a message appears
                 underlined or in italics; this is usually text the
                 author has quoted from earlier messages. The code-like
                 expressions below the authors' names are their
                 addresses on Internet.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bartolini:2007:MPD,
  author =       "S. Bartolini and P. Foglia and C. A. Prete",
  title =        "{MEmory} performance: {DEaling} with applications,
                 systems and architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "4",
  pages =        "4--5",
  month =        sep,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1327312.1327314",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:50:54 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In this issue, we present the papers from MEDEA-2006
                 Workshop [3] held in conjunction with the IEEE-ACM
                 International Conference on Parallel Architectures and
                 Compilation Techniques (PACT-2006) [1,2].",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "Medea 2006 workshop.",
}

@Article{Lorton:2007:ABL,
  author =       "K. Patrick Lorton and David S. Wise",
  title =        "Analyzing block locality in {Morton}-order and
                 {Morton}-hybrid matrices",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "4",
  pages =        "6--12",
  month =        sep,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1327312.1327315",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:50:54 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "As the architectures of computers change, introducing
                 more caches onto multicore chips, even more locality
                 becomes necessary. With the bandwidth between caches
                 and RAM now even more valuable, additional locality
                 from new matrix representations will be important to
                 keep multiple processors busy. The default storage
                 representations of both C and Fortran, row- and
                 column-major respectively, have fundamental
                 deficiencies with many matrix computations. By
                 switching the storage representation from Cartesian to
                 block indices, one is able to take better advantage of
                 cache locality at all levels from L1 to paging. This
                 paper only changes storage representation from
                 row-major to Morton-hybrid, and applies it to matrix
                 multiplication. Its purpose is to show that, even with
                 only traditional iterative algorithms, simply changing
                 storage representation offers significant speedups.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "Cholesky factorization; Morton order; quadtrees",
  remark =       "Medea 2006 workshop.",
}

@Article{Deris:2007:ICE,
  author =       "Kaveh Jokar Deris and Amirali Baniasadi",
  title =        "Investigating cache energy and latency break-even
                 points in high performance processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "4",
  pages =        "13--20",
  month =        sep,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1327312.1327316",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:50:54 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In this work we study how cache complexity impacts
                 energy and performance in high performance processors.
                 Moreover, we estimate cache energy budget for two high
                 performance processors. We calculate energy and latency
                 break-even points for realistic and ideal cache
                 organizations for different applications. We show that
                 design efforts made to reduce cache miss rate are only
                 justifiable from the energy and performance point of
                 view only if the associated latency and energy overhead
                 remain below the calculated break-even
                 points.\par

                 Furthermore, we show that, for the processors and
                 applications studied here, the instruction cache has a
                 lower latency break-even point compared to the data
                 cache. However, investing energy in the data cache is
                 likely to result in better energy efficiency compared
                 to the instruction cache.\par

                 We also study alternative cache configurations for
                 different processors and investigate if such
                 alternatives would improve energy efficiency.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "Medea 2006 workshop.",
}

@Article{Yan:2007:EIC,
  author =       "Jun Yan and Wei Zhang",
  title =        "Evaluating instruction cache vulnerability to
                 transient errors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "4",
  pages =        "21--28",
  month =        sep,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1327312.1327317",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:50:54 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Recent research shows that microprocessors are
                 increasingly susceptible to transient errors. In order
                 to protect microprocessors cost-effectively, the first
                 step is to accurately understand the impact of
                 transient errors on the system reliability. While many
                 research efforts have been focused on studying the
                 vulnerability of data caches and other on-chip hardware
                 components, instruction caches have received less
                 attention. However, instructions are read every cycle,
                 any undetected or uncorrected soft errors in
                 instructions can lead to erroneous computation, wrong
                 control flow or system crash.\par

                 This paper studies the instruction cache vulnerability
                 by considering both the raw SRAM rate and the cache
                 vulnerability factor. Based on the concept of cache
                 vulnerability factor, we also investigate the impact of
                 different cache configuration parameters on the
                 reliability of instruction caches. We find that on
                 average 67.5\% of instruction cache soft errors can be
                 masked by the I-cache itself without impacting other
                 system components. While quantifying the instruction
                 cache vulnerability itself does not solve the
                 reliability problem of instruction cache against
                 transient errors, we believe this work can provide
                 useful insights for designers to develop cost-effective
                 solutions to protect I-caches and to optimally balance
                 the reliability of instruction caches with other system
                 goals, such as cost, performance and energy.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "Medea 2006 workshop.",
}

@Article{Ramirez:2007:EST,
  author =       "Tanaus{\'u} Ram{\'\i}rez and Alex Pajuelo and Oliverio
                 J. Santana and Mateo Valero",
  title =        "Energy saving through a simple load control
                 mechanism",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "4",
  pages =        "29--36",
  month =        sep,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1327312.1327318",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:50:54 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "To alleviate the memory wall problem, current
                 architectural trends suggest implementing large
                 instruction windows able to maintain a high number of
                 in-flight instructions. However, the benefits achieved
                 by these recent proposals may be limited because more
                 instructions are executed down the wrong path of a
                 mispredicted branch. The larger number of misspeculated
                 instructions involves increasing the energy consumed
                 compared to traditional designs with smaller
                 instruction windows. Our analysis shows that, for some
                 SPEC2000 integer benchmarks, up to 2.5X wrong-path
                 load instructions are executed when the instruction
                 window of a 4-way superscalar processor is increased
                 from 256 to 1024 entries.\par

                 This paper describes a simple speculative control
                 technique to prevent wrong-path load instructions from
                 being executed. Our technique extends the functionality
                 of the load-store queue to block those load
                 instructions that depend on a hard-to-predict
                 conditional branch until it is resolved. If the branch
                 is actually mispredicted, unnecessary cache misses can
                 be avoided, saving energy down the wrong path.
                 Furthermore, instructions that depend on a blocked load
                 are not issued because their source values are not
                 available, which also saves dynamic energy. Our results
                 show that the proposed mechanism reduces, on average,
                 up to 26\% misspeculated load instructions and 18\%
                 wrong-path instructions without any performance loss.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "branch prediction; confidence estimation; energy
                 saving; kilo-instruction processors",
  remark =       "Medea 2006 workshop.",
}

@Article{Ramos:2007:DPC,
  author =       "Luis M. Ramos and Jos{\'e} Luis Briz and Pablo E.
                 Ib{\'a}{\~n}ez and Victor Vi{\~n}als",
  title =        "Data prefetching in a cache hierarchy with high
                 bandwidth and capacity",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "4",
  pages =        "37--44",
  month =        sep,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1327312.1327319",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:50:54 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In this paper we evaluate four hardware data
                 prefetchers in the context of a high-performance
                 three-level on chip cache hierarchy with high bandwidth
                 and capacity. We consider two classic prefetchers
                 (Sequential Tagged and Stride) and two correlating
                 prefetchers: PC/DC, a recent method with a superior
                 score and low-sized tables, and P-DFCM, a new method.
                 Like PC/DC, P-DFCM focuses on local delta sequences,
                 but it is based on the DFCM value predictor. We explore
                 different prefetch degrees and distances. Running
                 SPEC2000, Olden and IAbench applications, results show
                 that this kind of cache hierarchy turns prefetching
                 aggressiveness into success for the four prefetchers.
                 Sequential Tagged is the best, and deserves further
                 attention to cut its losses in some applications. PC/DC
                 results are matched or even improved by P-DFCM, using
                 far fewer accesses to tables while keeping sizes low.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "hardware data prefetching",
  remark =       "Medea 2006 workshop.",
}

@Article{Dybdahl:2007:LBR,
  author =       "Haakon Dybdahl and Per Stenstr{\"o}m and Lasse
                 Natvig",
  title =        "An {LRU}-based replacement algorithm augmented with
                 frequency of access in shared chip-multiprocessor
                 caches",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "4",
  pages =        "45--52",
  month =        sep,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1327312.1327320",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:50:54 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper proposes a new replacement algorithm to
                 protect cache lines with potential future reuse from
                 being evicted. In contrast to the recency based
                 approaches used in the past (LRU for example), our
                 algorithm also uses the notion of frequency of access.
                 Instead of evicting the least recently used block, our
                 algorithm identifies among a set of LRU blocks the one
                 that is also least-frequently-used (according to a
                 heuristic) and chooses that as a victim. We have
                 implemented this replacement algorithm in a detailed
                 simulation model of a chip multiprocessor system driven
                 by SPEC2000 benchmarks. We have found that the new
                 scheme improves performance for memory intensive
                 applications. Moreover, as compared to other attempts,
                 our replacement algorithm provides robust improvements
                 across all benchmarks. We have also extended an earlier
                 scheme proposed by Wong and Baer so it is switched off
                 when performance is not improved. Our results show that
                 this makes the scheme much more suitable for CMP
                 configurations.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "Medea 2006 workshop.",
}

@Article{Bardine:2007:IPE,
  author =       "A. Bardine and P. Foglia and G. Gabrielli and C. A.
                 Prete and P. Stenstr{\"o}m",
  title =        "Improving power efficiency of {D-NUCA} caches",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "4",
  pages =        "53--58",
  month =        sep,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1327312.1327321",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:50:54 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "D-NUCA caches are cache memories that, thanks to
                 banked organization, broadcast search and
                 promotion/demotion mechanism, are able to tolerate the
                 increasing wire delay effects introduced by technology
                 scaling. As a consequence, they will outperform
                 conventional caches (UCA, Uniform Cache Architectures)
                 in future generation cores.\par

                 Due to the promotion/demotion mechanism, we have found
                 that, in a D-NUCA cache, the distribution of hits on
                 the ways varies across applications as well as across
                 different execution phases within a single application.
                 In this paper, we show how such a behavior can be
                 utilized to improve D-NUCA power efficiency as well as
                 to decrease its access latencies. In particular, we
                 propose a new D-NUCA structure, called Way Adaptable
                 D-NUCA cache, in which the number of active (i.e.
                 powered-on) ways is dynamically adapted to the need of
                 the running application. Our initial evaluation shows
                 that a consistent reduction of both the average number
                 of active ways (42\% in average) and the number of bank
                 access requests (29\% in average) is achieved, without
                 significantly affecting the IPC.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "Medea 2006 workshop.",
}

@Article{Thorson:2007:INc,
  author =       "Mark Thorson",
  title =        "{Internet} nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "4",
  pages =        "59--62",
  month =        sep,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1327312.1327323",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:50:54 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This column consists of selected traffic from the
                 comp.arch newsgroup, a forum for discussion of
                 computer architecture on the Internet---an
                 international computer network.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kise:2007:SIA,
  author =       "Kenji Kise and Toshinori Sato and Hironori Nakajo",
  title =        "Special issue: {ALPS'07 -- Advanced Low Power
                 Systems}: Introduction",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "5",
  pages =        "1--2",
  month =        dec,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1360464.1360469",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:13 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In this issue, we present the papers from the
                 proceedings of the 2nd International Workshop on
                 Advanced Low Power Systems (ALPS 2007) that was held in
                 conjunction with the 21st International Conference on
                 Supercomputing in Seattle.\par

                 `Thoughtfulness' is an important keyword in the both
                 current and future technologies in all over the world:
                 Thoughtful to human being, thoughtful to our
                 surroundings, thoughtful to the earth, and so on. For
                 the thoughtfulness, Low-power is believed to be one of
                 the most indispensable keyword. The ALPS workshop
                 focuses on the current technological challenges in
                 developing low-power and power-aware computing systems
                 ranging from servers to embedded devices. The goal of
                 the workshop is to bring all aspects of power-aware
                 computing from industry and academia.\par

                 This year, we have one invited talk entitled `An Under
                 2W 100GOPS Video Recognition Processor Based on a
                 Linear Array of 128 4-Way VLIW Processing Elements' by
                 Shorin Kyo (NEC Corporation) and 6 papers selected
                 based on the full paper review by the program committee
                 members.\par

                 The first set of papers discusses low-power designs. We
                 have three papers: `Optimal Pipeline Depth with
                 Pipeline Stage Unification Adoption' by Jun Yao, Hajime
                 Shimada, Shinobu Miwa, and Shinji Tomita, `VCLEARIT: A
                 VLSI CMOS Circuit Leakage Reduction Technique For
                 Nanoscale Technologies' by Preetham Lakshmikanthan and
                 Adrian Nu{\~n}ez, and `Leakage Energy Reduction in Cache
                 Memory by Data Compression' by Kiyofumi Tanaka and
                 Takahiro Kawahara.\par

                 The second set of papers: `Preventing Timing Errors on
                 Register Writes: Mechanisms of Detections and
                 Recoveries' by Hidetsugu Irie, Ken Sugimoto, Masahiro
                 Goshima, and Suichi Sakai, `Not Multi-, but Many-Core:
                 Designing Integral Parallel Architectures for Embedded
                 Computation' by Mihaela Malita, Gheorghe Stefany, and
                 Dominique Thi{\'e}baut, and `Fine-grain Compensation
                 Method with Consideration of Trade-offs between
                 Computation and Data Transfer for Power Consumption' by
                 Takefumi Miyoshi and Nobuhiko Sugino, covers
                 reliability, many-core and parallelization
                 issues.\par

                 All papers here are going to create the way to the new
                 aspects of low-power systems. We hope you will find the
                 papers of this special issue of Computer Architecture
                 News to be stimulating and that you will be inspired to
                 contribute your efforts to the future low power
                 systems.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Yao:2007:OPD,
  author =       "Jun Yao and Shinobu Miwa and Hajime Shimada and Shinji
                 Tomita",
  title =        "Optimal pipeline depth with pipeline stage unification
                 adoption",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "5",
  pages =        "3--9",
  month =        dec,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1360464.1360470",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:13 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "To find the optimal pipeline design point by
                 considering both performance and power objectives has
                 been one focus of interest in recent researches.
                 However, we found that previous papers did not consider
                 deepening or shrinking pipeline depth dynamically
                 during the program execution. In this paper, with the
                 adoption of the earlier proposed Pipeline Stage
                 Unification (PSU) method, we studied the relationship
                 between power/performance and pipeline depth in
                 processors with a pipeline of multi-usable depths. Our
                 evaluation results of SPECint2000 benchmarks shown in
                 this paper illustrate that the PSU adoption can achieve
                 good efficiency for platforms which concern both energy
                 and performance, even after the utilization of complex
                 clock gating.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "pipeline design point; pipeline stage unification;
                 power/performance",
}

@Article{Lakshmikanthan:2007:VVC,
  author =       "Preetham Lakshmikanthan and Adrian Nu{\~n}ez",
  title =        "{VCLEARIT}: a {VLSI CMOS} circuit leakage reduction
                 technique for nanoscale technologies",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "5",
  pages =        "10--16",
  month =        dec,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1360464.1360471",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:13 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Leakage power loss is a major concern in
                 deep-submicron technologies as it drains the battery
                 even when a circuit is completely idle. In this paper,
                 we first present a novel leakage reduction technique
                 and then compare and contrast it with other well
                 established leakage reduction techniques. Our leakage
                 reduction technique achieves cancellation of leakage
                 effects in both the pull-up network (PUN) as well as
                 the pull-down network (PDN) for CMOS circuits. It
                 involves voltage balancing in the PUN and PDN paths
                 using a combination of high-V$_T$ (high voltage
                 threshold) and standard-V$_T$ sleep transistors.
                 Experiments conducted on a variety of multi-level
                 combinational MCNC'91 benchmarks show significant
                 savings in leakage power (up to 3 orders of magnitude),
                 with lesser area and delay penalty using our leakage
                 reduction technique when compared to other
                 techniques.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tanaka:2007:LER,
  author =       "Kiyofumi Tanaka and Takahiro Kawahara",
  title =        "Leakage energy reduction in cache memory by data
                 compression",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "5",
  pages =        "17--24",
  month =        dec,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1360464.1360472",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:13 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Cache memory is effective in bridging a growing speed
                 gap between a processor and relatively slow external
                 main memory. Almost all of today's commercial
                 processors, not only high-performance microprocessors
                 but embedded ones, have on-chip cache memories.
                 However, energy consumption in the cache memory would
                 approach or exceed 50\% of the total consumption by the
                 processors, which leads to a serious problem in terms
                 of allowable temperature and performance improvement.
                 An important point to note is that, in the near future,
                 static (leakage) energy will dominate the energy
                 consumption in deep sub-micron processes. In this
                 paper, we propose cache memory architecture that
                 exploits gated-Vdd control per cache block and a
                 dynamic data compression scheme in the secondary cache,
                 and achieves efficient reduction of static energy
                 consumed by the secondary cache memory. In the
                 simulation using SPEC95 integer benchmarks, our
                 technique reduced about 45\% of leakage energy in the
                 cache at maximum, and about 28\% on average.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "cache memory; data compression; gated-Vdd; leakage
                 energy",
}

@Article{Irie:2007:PTE,
  author =       "Hidetsugu Irie and Ken Sugimoto and Masahiro Goshima
                 and Shuichi Sakai",
  title =        "Preventing timing errors on register writes:
                 mechanisms of detections and recoveries",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "5",
  pages =        "25--31",
  month =        dec,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1360464.1360473",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:13 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "To deal with the increasing variations of the
                 intra-chip transistors, one promising approach is to
                 dynamically detect and recover the timing-errors with
                 microarchitecture. This will induce dependability and
                 efficiency into microprocessors because it allows VLSI
                 to operate at the optimum frequency and voltage while
                 ensuring accuracy.\par

                 A few approaches for dynamically detecting
                 timing-errors have been proposed, but none of them have
                 focused on register writes. In this paper, we propose a
                 technique for detecting and recovering from timing
                 errors during register writes. We introduce a verifying
                 technique that uses additional buffer (called the write
                 assurance buffer (WAB)) which is provided with a
                 sufficient timing margin. The evaluation results reveal
                 a performance degradation of 4.5\% using an 8-entry
                 WAB; this value becomes negligible when a 16-entry WAB
                 is used.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Malita:2007:MMC,
  author =       "Mihaela Mali{\c{t}}a and Gheorghe {\c{S}}tefan and
                 Dominique Thi{\'e}baut",
  title =        "Not multi-, but many-core: designing integral parallel
                 architectures for embedded computation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "5",
  pages =        "32--38",
  month =        dec,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1360464.1360474",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:13 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Recent embedded systems have switched to fully
                 programmable parallel architectures. To make sure all
                 corner cases usually present in real applications are
                 supported and efficiently implemented in this switch of
                 implementation, new solutions must be found. We
                 introduce the integral parallel architecture (IPA) as a
                 solution supporting intensive data computation in
                 System-on-a-chip (SoC) implementations, fitting in a
                 small area, and requiring low power. An IPA supports
                 naturally all three possible styles of parallelism:
                 data, time, and speculative.\par

                 As an illustrative example, we present the BA1024 chip,
                 a fully programmable SoC designed by BrightScale, Inc.
                 for HDTV codecs. Its main performance figures include
                 60 GOPS/Watt and 2 GOPS/mm$^2$, representing an
                 efficient IPA approach for embedded computation.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "embedded systems; parallel architectures; programmable
                 systems; video processing",
}

@Article{Miyoshi:2007:FGC,
  author =       "Takefumi Miyoshi and Nobuhiko Sugino",
  title =        "Fine-grain compensation method with consideration of
                 trade-offs between computation and data transfer for
                 power consumption",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "5",
  pages =        "39--44",
  month =        dec,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1360464.1360475",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:13 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Fine-grain parallelizing method with consideration of
                 the number of data transfers for low power consumption
                 is proposed. In the proposed method, power consumption
                 by data transfers between processor elements in a
                 multiprocessor is focused on, and the number of data
                 transfers is reduced.\par

                 In this paper, a measure based on the relationship
                 between variables in a given program is defined to
                 evaluate the number of data transfers, firstly. And
                 then a proposed compensation method by use of the
                 evaluation of power consumption based on the measure is
                 explained. Finally, the result of applying proposed
                 compensation method implemented on COINS framework to
                 several example programs is shown.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Romanescu:2007:VSC,
  author =       "Bogdan F. Romanescu and Michael E. Bauer and Sule Ozev
                 and Daniel J. Sorin",
  title =        "{VariaSim}: simulating circuits and systems in the
                 presence of process variability",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "5",
  pages =        "45--48",
  month =        dec,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1360464.1360465",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:13 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In this paper, we present VariaSim, the publicly
                 available Static Statistical Timing Analysis (SSTA)
                 Tool from Duke University. VariaSim enables researchers
                 to analyze the impact of CMOS process variability on
                 the behavior of circuits and systems.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Venkateswaran:2007:FGSa,
  author =       "N. Venkateswaran and Deepak Srinivasan and Madhavan
                 Manivannan and T. P. Ramnath Sai Sagar and Shyamsundar
                 Gopalakrishnan and VinothKrishnan Elangovan and Karthik
                 Chandrasekar and Prem Kumar Ramesh and Viswanath
                 Venkatesan and Arvindakshan Babu and Sudharshan",
  title =        "Future generation supercomputers {I}: a paradigm for
                 node architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "5",
  pages =        "49--60",
  month =        dec,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1360464.1360466",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:13 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "As a result of the increasing requirements of present
                 and future computation intensive applications, there
                 have been many fundamentally divergent approaches such
                 as the Blue-Gene, TRIPS, HERO, Cascade spurred in order
                 to provide increased performance at node level in
                 supercomputing clusters. The design of the node
                 architecture should be such that `Cost-Effective
                 Supercomputing' is realized without compromising on the
                 requirements of the ever-performance hungry grand
                 challenge applications. However, to increase
                 performance at the cluster level, scalability and
                 likewise tackling the mapping complexity across the
                 large cluster of nodes becomes critical. The potential
                 of such a node architecture can be fully exploited only
                 with an appropriate cluster architecture. In an attempt
                 to address these issues for efficient and
                 Cost-Effective Supercomputing, we propose a novel
                 paradigm for designing High Performance Clusters, in
                 two papers. In paper-II, we discuss the design of
                 operating system and cluster architecture. In this
                 paper, we present a node architecture model based on
                 the Memory In Processor paradigm and discuss the
                 related architectural aspects (ISA, compiler, network
                 interconnection etc). We provide a design space based
                 on the proposed model for which a simulator is
                 developed, with the help of which the performance of
                 such a node architecture is outlined.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Venkateswaran:2007:FGSb,
  author =       "N. Venkateswaran and Deepak Srinivasan and Madhavan
                 Manivannan and T. P. Ramnath Sai Sagar and Shyamsundar
                 Gopalakrishnan and VinothKrishnan Elangovan and Arvind
                 M. and Prem Kumar Ramesh and Karthik Ganesan and
                 Viswanath Krishnamurthy and Sivaramakrishnan",
  title =        "Future generation supercomputers {II}: a paradigm for
                 cluster architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "5",
  pages =        "61--70",
  month =        dec,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1360464.1360467",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:13 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In part-I, a novel multi-core node architecture was
                 proposed which when employed in a cluster environment
                 would be capable of tackling computational complexity
                 associated with wide class of applications.
                 Furthermore, it was discussed that by appropriately
                 scaling the architectural specifications, Teraops
                 computing power could be achieved at the node level. In
                 order to harness the computational power of such a
                 node, we have developed an efficient application
                 execution model with a competent cluster architectural
                 backbone. In this paper we present the novel cluster
                 paradigm, dealing with operating system design,
                 parallel programming model and cluster interconnection
                 network. Our approach in developing the competent
                 cluster design revolves around an execution model to
                 aid the execution of multiple applications
                 simultaneously on all partitions of the cluster,
                 leading to cost sharing across applications. This would
                 be a major initiative towards achieving Cost-Effective
                 Supercomputing.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:2007:INd,
  author =       "Mark Thorson",
  title =        "{Internet} nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "35",
  number =       "5",
  pages =        "71--73",
  month =        dec,
  year =         "2007",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1360464.1360477",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:13 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This column consists of selected traffic from the
                 comp.arch newsgroup, a forum for discussion of computer
                 architecture on the Internet---an international
                 computer network.\par

                 As always, the opinions expressed in this column are
                 the personal views of the authors, and do not
                 necessarily represent the institutions to which they
                 are affiliated.\par

                 Text which sets the context of a message appears
                 underlined or in italics; this is usually text the
                 author has quoted from earlier messages. The code-like
                 expressions below the authors' names are their
                 addresses on Internet.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Winfree:2008:TMP,
  author =       "Erik Winfree",
  title =        "Toward molecular programming with {DNA}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "1",
  pages =        "1--1",
  month =        mar,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1353534.1346282",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:35 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Biological organisms are beautiful examples of
                 programming. The program and data are stored in
                 biological molecules such as DNA, RNA, and proteins;
                 the algorithms are carried out by molecular and
                 biochemical processes; and the end result is the
                 creation and function of an organism. If we understood
                 how to program molecular systems, what could we create?
                 Lifelike technologies whose basic operations are
                 chemical reactions? The fields of chemistry, physics,
                 biology, and computer science are converging as we
                 begin to synthesize molecules, molecular machines, and
                 molecular systems of ever increasing complexity,
                 leading to subdisciplines such as DNA nanotechnology,
                 DNA computing, and synthetic biology. Having
                 demonstrated simple devices and systems --
                 self-assembled structures, molecular motors, chemical
                 logic gates -- researchers are now turning to the
                 question of how to create large-scale integrated
                 systems. To do so, we must learn how to manage
                 complexity: how to efficiently specify the structure
                 and behavior of intricate molecular systems, how to
                 compile such specifications down to the design of
                 molecules to be synthesized in the lab, and how to
                 ensure that such systems function robustly. These
                 issues will be illustrated for chemical logic circuits
                 based on cascades of DNA hybridization reactions.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "DNA; molecular programming",
}

@Article{Chen:2008:OVB,
  author =       "Xiaoxin Chen and Tal Garfinkel and E. Christopher
                 Lewis and Pratap Subrahmanyam and Carl A. Waldspurger
                 and Dan Boneh and Jeffrey Dwoskin and Dan R. K. Ports",
  title =        "{Overshadow}: a virtualization-based approach to
                 retrofitting protection in commodity operating
                 systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "1",
  pages =        "2--13",
  month =        mar,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1346281.1346284",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:35 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Commodity operating systems entrusted with securing
                 sensitive data are remarkably large and complex, and
                 consequently, frequently prone to compromise. To
                 address this limitation, we introduce a
                 virtual-machine-based system called Overshadow that
                 protects the privacy and integrity of application data,
                 even in the event of a total OS compromise. Overshadow
                 presents an application with a normal view of its
                 resources, but the OS with an encrypted view. This
                 allows the operating system to carry out the complex
                 task of managing an application's resources, without
                 allowing it to read or modify them. Thus, Overshadow
                 offers a last line of defense for application
                 data.\par

                 Overshadow builds on multi-shadowing, a novel mechanism
                 that presents different views of `physical' memory,
                 depending on the context performing the access. This
                 primitive offers an additional dimension of protection
                 beyond the hierarchical protection domains implemented
                 by traditional operating systems and processor
                 architectures.\par

                 We present the design and implementation of Overshadow
                 and show how its new protection semantics can be
                 integrated with existing systems. Our design has been
                 fully implemented and used to protect a wide range of
                 unmodified legacy applications running on an unmodified
                 Linux operating system. We evaluate the performance of
                 our implementation, demonstrating that this approach is
                 practical.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "cloaking; hypervisors; memory protection;
                 multi-shadowing; operating systems; virtual machine
                 monitors; VMM",
}

@Article{McCune:2008:HLC,
  author =       "Jonathan M. McCune and Bryan Parno and Adrian Perrig
                 and Michael K. Reiter and Arvind Seshadri",
  title =        "How low can you go?: recommendations for
                 hardware-supported minimal {TCB} code execution",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "1",
  pages =        "14--25",
  month =        mar,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1346281.1346285",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:35 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "We explore the extent to which newly available
                 CPU-based security technology can reduce the Trusted
                 Computing Base (TCB) for security-sensitive
                 applications. We find that although this new technology
                 represents a step in the right direction, significant
                 performance issues remain. We offer several suggestions
                 that leverage existing processor technology, retain
                 security, and improve performance. Implementing these
                 recommendations will finally allow application
                 developers to focus exclusively on the security of
                 their own code, enabling it to execute in isolation
                 from the numerous vulnerabilities in the underlying
                 layers of legacy code.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "late launch; secure execution; trusted computing",
}

@Article{Bhargava:2008:ATD,
  author =       "Ravi Bhargava and Benjamin Serebrin and Francesco
                 Spadini and Srilatha Manne",
  title =        "Accelerating two-dimensional page walks for
                 virtualized systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "1",
  pages =        "26--35",
  month =        mar,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1346281.1346286",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:35 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Nested paging is a hardware solution for alleviating
                 the software memory management overhead imposed by
                 system virtualization. Nested paging complements
                 existing page walk hardware to form a two-dimensional
                 (2D) page walk, which reduces the need for hypervisor
                 intervention in guest page table management. However,
                 the extra dimension also increases the maximum number
                 of architecturally-required page table
                 references.\par

                 This paper presents an in-depth examination of the 2D
                 page table walk overhead and options for decreasing it.
                 These options include using the AMD Opteron processor's
                 page walk cache to exploit the strong reuse of page
                 entry references. For a mix of server and SPEC
                 benchmarks, the presented results show a 15\%--38\%
                 improvement in guest performance by extending the
                 existing page walk cache to also store the nested
                 dimension of the 2D page walk. Caching nested page
                 table translations and skipping multiple page entry
                 references produce an additional 3\%--7\%
                 improvement.\par

                 Much of the remaining 2D page walk overhead is due to
                 low-locality nested page entry references, which result
                 in additional memory hierarchy misses. By using large
                 pages, the hypervisor can eliminate many of these
                 long-latency accesses and further improve the guest
                 performance by 3\%--22\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "AMD; hypervisor; memory management; nested paging;
                 page walk caching; TLB; virtual machine monitor;
                 virtualization",
}

@Article{Lee:2008:ETL,
  author =       "Benjamin C. Lee and David Brooks",
  title =        "Efficiency trends and limits from comprehensive
                 microarchitectural adaptivity",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "1",
  pages =        "36--47",
  month =        mar,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1353534.1346288",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:35 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Increasing demand for power-efficient,
                 high-performance computing requires tuning applications
                 and/or the underlying hardware to improve the mapping
                 between workload heterogeneity and computational
                 resources. To assess the potential benefits of hardware
                 tuning, we propose a framework that leverages
                 synergistic interactions between recent advances in (a)
                 sampling, (b) predictive modeling, and (c) optimization
                 heuristics. This framework enables qualitatively new
                 capabilities in analyzing the performance and power
                 characteristics of adaptive microarchitectures. For the
                 first time, we are able to simultaneously consider high
                 temporal and comprehensive spatial adaptivity. In
                 particular, we optimize efficiency for many, short
                 adaptive intervals and identify the best configuration
                 of 15 parameters, which define a space of 240B
                 points.\par

                 With frequent sub-application reconfiguration and a
                 fully reconfigurable hardware substrate, adaptive
                 microarchitectures achieve bips$^3$/w efficiency gains
                 of up to 5.3x (median 2.4x) relative to their static
                 counterparts already optimized for a given application.
                 This 5.3x efficiency gain is derived from a 1.6x
                 performance gain and 0.8x power reduction. Although
                 several applications achieve a significant fraction of
                 their potential efficiency with as few as three
                 adaptive parameters, the three most significant
                 parameters differ across applications. These
                 differences motivate a hardware substrate capable of
                 comprehensive adaptivity to meet these diverse
                 application requirements.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "adaptivity; efficiency; inference; microarchitecture;
                 performance; power; reconfigurability; regression;
                 simulation; statistics",
}

@Article{Raghavendra:2008:NPS,
  author =       "Ramya Raghavendra and Parthasarathy Ranganathan and
                 Vanish Talwar and Zhikui Wang and Xiaoyun Zhu",
  title =        "No `power' struggles: coordinated multi-level power
                 management for the data center",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "1",
  pages =        "48--59",
  month =        mar,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1353534.1346289",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:35 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Power delivery, electricity consumption, and heat
                 management are becoming key challenges in data center
                 environments. Several past solutions have individually
                 evaluated different techniques to address separate
                 aspects of this problem, in hardware and software, and
                 at local and global levels. Unfortunately, there has
                 been no corresponding work on coordinating all these
                 solutions. In the absence of such coordination, these
                 solutions are likely to interfere with one another, in
                 unpredictable (and potentially dangerous) ways. This
                 paper seeks to address this problem. We make two key
                 contributions. First, we propose and validate a power
                 management solution that coordinates different
                 individual approaches. Using simulations based on 180
                 server traces from nine different real-world
                 enterprises, we demonstrate the correctness, stability,
                 and efficiency advantages of our solution. Second,
                 using our unified architecture as the base, we perform
                 a detailed quantitative sensitivity analysis and draw
                 conclusions about the impact of different
                 architectures, implementations, workloads, and system
                 design choices.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "capping; control theory; coordination; data center;
                 efficiency; power management; virtualization",
}

@Article{Ballapuram:2008:EAS,
  author          = {Chinnakrishnan S. Ballapuram and Ahmad Sharif and
                     Hsien-Hsin S. Lee},
  title           = {Exploiting access semantics and program behavior to
                     reduce snoop power in chip multiprocessors},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {36},
  number          = {1},
  pages           = {60--69},
  month           = mar,
  year            = {2008},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/1346281.1346290},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Tue Jun 17 11:51:35 MDT 2008},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Integrating more processor cores on-die has become the
                     unanimous trend in the microprocessor industry. Most of
                     the current research thrusts using chip multiprocessors
                     (CMPs) as the baseline to analyze problems in various
                     domains. One of the main design issues facing CMP
                     systems is the growing number of snoops required to
                     maintain cache coherency and to support
                     self/cross-modifying code that leads to power and
                     performance limitations. In this paper, we analyze the
                     internal and external snoop behavior in a CMP system
                     and relax the snoopy cache coherence protocol based on
                     the program semantics and properties of the shared
                     variables for saving power. Based on the observations
                     and analyses, we propose two novel techniques:
                     Selective Snoop Probe (SSP) and Essential Snoop Probe
                     (ESP) to reduce power without compromising performance.
                     Our simulation results show that using the SSP
                     technique, 5\% to 65\% data cache energy savings per
                     core for different processor configurations can be
                     achieved with 1\% to 2\% performance improvement. We
                     also show that 5\% to 82\% of data cache energy per
                     core is spent on the non-essential snoop probes that
                     can be saved using the ESP technique.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  keywords        = {chip multiprocessors; internal and external snoops;
                     MESI protocol; self-modifying code},
}

@Article{Mallik:2008:PMU,
  author =       "Arindam Mallik and Jack Cosgrove and Robert P. Dick
                 and Gokhan Memik and Peter Dinda",
  title =        "{PICSEL}: measuring user-perceived performance to
                 control dynamic frequency scaling",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "1",
  pages =        "70--79",
  month =        mar,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1353534.1346291",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:35 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The ultimate goal of a computer system is to satisfy
                 its users. The success of architectural or system-level
                 optimizations depends largely on having accurate
                 metrics for user satisfaction. We propose to derive
                 such metrics from information that is `close to flesh'
                 and apparent to the user rather than from information
                 that is `close to metal' and hidden from the user. We
                 describe and evaluate PICSEL, a dynamic voltage and
                 frequency scaling (DVFS) technique that uses
                 measurements of variations in the rate of change of a
                 computer's video output to estimate user-perceived
                 performance. Our adaptive algorithms, one conservative
                 and one aggressive, use these estimates to dramatically
                 reduce operating frequencies and voltages for
                 graphically-intensive applications while maintaining
                 performance at a satisfactory level for the user. We
                 evaluate PICSEL through user studies conducted on a
                 Pentium M laptop running Windows XP. Experiments
                 performed with 20 users executing three applications
                 indicate that the measured laptop power can be reduced
                 by up to 12.1\%, averaged across all of our users and
                 applications, compared to the default Windows XP DVFS
                 policy. User studies revealed that the difference in
                 overall user satisfaction between the more aggressive
                 version of PICSEL and Windows DVFS were statistically
                 insignificant, whereas the conservative version of
                 PICSEL actually improved user satisfaction when
                 compared to Windows DVFS.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "dynamic voltage and frequency scaling; power
                 management; thermal emergency; user-perceived
                 performance",
}

@Article{Joao:2008:IPO,
  author          = {Jose A. Joao and Onur Mutlu and Hyesoon Kim and Rishi
                     Agarwal and Yale N. Patt},
  title           = {Improving the performance of object-oriented languages
                     with dynamic predication of indirect jumps},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {36},
  number          = {1},
  pages           = {80--90},
  month           = mar,
  year            = {2008},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/1353535.1346293},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Tue Jun 17 11:51:35 MDT 2008},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Indirect jump instructions are used to implement
                     increasingly-common programming constructs such as
                     virtual function calls, switch-case statements, jump
                     tables, and interface calls. The performance impact of
                     indirect jumps is likely to increase because indirect
                     jumps with multiple targets are difficult to predict
                     even with specialized hardware.\par

                     This paper proposes a new way of handling
                     hard-to-predict indirect jumps: dynamically predicating
                     them. The compiler (static or dynamic) identifies
                     indirect jumps that are suitable for predication along
                     with their control-flow merge (CFM) points. The
                     hardware predicates the instructions between different
                     targets of the jump and its CFM point if the jump turns
                     out to be hard-to-predict at run time. If the jump
                     would actually have been mispredicted, its dynamic
                     predication eliminates a pipeline flush, thereby
                     improving performance.\par

                     Our evaluations show that Dynamic Indirect jump
                     Predication (DIP) improves the performance of a set of
                     object-oriented applications including the Java DaCapo
                     benchmark suite by 37.8\% compared to a commonly-used
                     branch target buffer based predictor, while also
                     reducing energy consumption by 24.8\%. We compare DIP
                     to three previously proposed indirect jump predictors
                     and find that it provides the best performance and
                     energy-efficiency.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  keywords        = {dynamic predication; indirect jumps; object-oriented
                     languages; predicated execution; virtual functions},
}

@Article{Wegiel:2008:MCV,
  author          = {Michal Wegiel and Chandra Krintz},
  title           = {The mapping collector: virtual memory support for
                     generational, parallel, and concurrent compaction},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {36},
  number          = {1},
  pages           = {91--102},
  month           = mar,
  year            = {2008},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/1353535.1346294},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Tue Jun 17 11:51:35 MDT 2008},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Parallel and concurrent garbage collectors are
                     increasingly employed by managed runtime environments
                     (MREs) to maintain scalability, as multi-core
                     architectures and multi-threaded applications become
                     pervasive. Moreover, state-of-the-art MREs commonly
                     implement compaction to eliminate heap fragmentation
                     and enable fast linear object allocation.\par

                     Our empirical analysis of object demographics reveals
                     that unreachable objects in the heap tend to form
                     clusters large enough to be effectively managed at the
                     granularity of virtual memory pages. Even though
                     processes can manipulate the mapping of the virtual
                     address space through the standard operating system
                     (OS) interface on most platforms, extant
                     parallel/concurrent compactors do not do so to exploit
                     this clustering behavior and instead achieve compaction
                     by performing, relatively expensive, object moving and
                     pointer adjustment.\par

                     We introduce the Mapping Collector (MC), which
                     leverages virtual memory operations to reclaim and
                     consolidate free space without moving objects and
                     updating pointers. MC is a nearly-single-phase
                     compactor that is simpler and more efficient than
                     previously reported compactors that comprise two to
                     four phases. Through effective MRE-OS coordination, MC
                     maintains the simplicity of a non-moving collector
                     while providing efficient parallel and concurrent
                     compaction.\par

                     We implement both stop-the-world and concurrent MC in a
                     generational garbage collection framework within the
                     open-source HotSpot Java Virtual Machine. Our
                     experimental evaluation using a multiprocessor
                     indicates that MC significantly increases throughput
                     and scalability as well as reduces pause times,
                     relative to state-of-the-art, parallel and concurrent
                     compactors.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  keywords        = {compaction; concurrent; parallel; virtual memory},
}

@Article{Devietti:2008:HAS,
  author          = {Joe Devietti and Colin Blundell and Milo M. K. Martin
                     and Steve Zdancewic},
  title           = {{Hardbound}: architectural support for spatial safety
                     of the {C} programming language},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {36},
  number          = {1},
  pages           = {103--114},
  month           = mar,
  year            = {2008},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/1346281.1346295},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Tue Jun 17 11:51:35 MDT 2008},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {The C programming language is at least as well known
                     for its absence of spatial memory safety guarantees
                     (i.e., lack of bounds checking) as it is for its high
                     performance. C's unchecked pointer arithmetic and array
                     indexing allow simple programming mistakes to lead to
                     erroneous executions, silent data corruption, and
                     security vulnerabilities. Many prior proposals have
                     tackled enforcing spatial safety in C programs by
                     checking pointer and array accesses. However, existing
                     software-only proposals have significant drawbacks that
                     may prevent wide adoption, including: unacceptably high
                     run-time overheads, lack of completeness, incompatible
                     pointer representations, or need for non-trivial
                     changes to existing C source code and compiler
                     infrastructure.\par

                     Inspired by the promise of these software-only
                     approaches, this paper proposes a hardware bounded
                     pointer architectural primitive that supports
                     cooperative hardware/software enforcement of spatial
                     memory safety for C programs. This bounded pointer is a
                     new hardware primitive datatype for pointers that
                     leaves the standard C pointer representation intact,
                     but augments it with bounds information maintained
                     separately and invisibly by the hardware. The bounds
                     are initialized by the software, and they are then
                     propagated and enforced transparently by the hardware,
                     which automatically checks a pointer's bounds before it
                     is dereferenced. One mode of use requires instrumenting
                     only malloc, which enables enforcement of
                     per-allocation spatial safety for heap-allocated
                     objects for existing binaries. When combined with
                     simple intraprocedural compiler instrumentation,
                     hardware bounded pointers enable a low-overhead
                     approach for enforcing complete spatial memory safety
                     in unmodified C programs.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  keywords        = {C programming language; spatial memory safety},
}

@Article{Lvin:2008:ATA,
  author =       "Vitaliy B. Lvin and Gene Novark and Emery D. Berger
                 and Benjamin G. Zorn",
  title =        "{Archipelago}: trading address space for reliability
                 and security",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "1",
  pages =        "115--124",
  month =        mar,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1353535.1346296",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:35 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Memory errors are a notorious source of security
                 vulnerabilities that can lead to service interruptions,
                 information leakage and unauthorized access. Because
                 such errors are also difficult to debug, the absence of
                 timely patches can leave users vulnerable to attack for
                 long periods of time. A variety of approaches have been
                 introduced to combat these errors, but these often
                 incur large runtime overheads and generally abort on
                 errors, threatening availability.\par

                 This paper presents Archipelago, a runtime system that
                 takes advantage of available address space to
                 substantially reduce the likelihood that a memory error
                 will affect program execution. Archipelago randomly
                 allocates heap objects far apart in virtual address
                 space, effectively isolating each object from buffer
                 overflows. Archipelago also protects against dangling
                 pointer errors by preserving the contents of freed
                 objects after they are freed. Archipelago thus trades
                 virtual address space---a plentiful resource on 64-bit
                 systems---for significantly improved program
                 reliability and security, while limiting physical
                 memory consumption by tracking the working set of an
                 application and compacting cold objects. We show that
                 Archipelago allows applications to continue to run
                 correctly in the face of thousands of memory errors.
                 Across a suite of server applications, Archipelago's
                 performance overhead is 6\% on average (between $-7$\%
                 and 22\%), making it especially suitable to protect
                 servers that have known security vulnerabilities due to
                 heap memory errors.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "Archipelago; buffer overflow; dynamic memory
                 allocation; memory errors; probabilistic memory safety;
                 randomized algorithms; virtual memory",
}

@Article{Choi:2008:ABP,
  author          = {Bumyong Choi and Leo Porter and Dean M. Tullsen},
  title           = {Accurate branch prediction for short threads},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {36},
  number          = {1},
  pages           = {125--134},
  month           = mar,
  year            = {2008},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/1353534.1346298},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Tue Jun 17 11:51:35 MDT 2008},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Multi-core processors, with low communication costs
                     and high availability of execution cores, will increase
                     the use of execution and compilation models that use
                     short threads to expose parallelism. Current branch
                     predictors seek to incorporate large amounts of control
                     flow history to maximize accuracy. However, when that
                     history is absent the predictor fails to work as
                     intended. Thus, modern predictors are almost useless
                     for threads below a certain length.\par

                     Using a Speculative Multithreaded (SpMT) architecture
                     as an example of a system which generates shorter
                     threads, this work examines techniques to improve
                     branch prediction accuracy when a new thread begins to
                     execute on a different core. This paper proposes a
                     minor change to the branch predictor that gives
                     virtually the same performance on short threads as an
                     idealized predictor that incorporates unknowable
                     pre-history of a spawned speculative thread. At the
                     same time, strong performance on long threads is
                     preserved. The proposed technique sets the global
                     history register of the spawned thread to the initial
                     value of the program counter. This novel and simple
                     design reduces branch mispredicts by 29\% and provides
                     as much as a 13\% IPC improvement on selected SPEC2000
                     benchmarks.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  keywords        = {branch prediction; chip multiprocessors},
}

@Article{Srikantaiah:2008:ASP,
  author =       "Shekhar Srikantaiah and Mahmut Kandemir and Mary Jane
                 Irwin",
  title =        "Adaptive set pinning: managing shared caches in chip
                 multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "1",
  pages =        "135--144",
  month =        mar,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1353534.1346299",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:35 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "As part of the trend towards Chip Multiprocessors
                 (CMPs) for the next leap in computing performance, many
                 architectures have explored sharing the last level of
                 cache among different processors for better
                 performance-cost ratio and improved resource
                 allocation. Shared cache management is a crucial CMP
                 design aspect for the performance of the system. This
                 paper first presents a new classification of cache
                 misses---CII: Compulsory, Inter-processor and
                 Intra-processor misses---for CMPs with shared caches to
                 provide a better understanding of the interactions
                 between memory transactions of different processors at
                 the level of shared cache in a CMP. We then propose a
                 novel approach, called set pinning, for eliminating
                 inter-processor misses and reducing intra-processor
                 misses in a shared cache. Furthermore, we show that an
                 adaptive set pinning scheme improves over the benefits
                 obtained by the set pinning scheme by significantly
                 reducing the number of off-chip accesses. Extensive
                 analysis of these approaches with SPEComp 2001
                 benchmarks is performed using a full system simulator.
                 Our experiments indicate that the set pinning scheme
                 achieves an average improvement of 22.18\% in the L2
                 miss rate while the adaptive set pinning scheme reduces
                 the miss rates by an average of 47.94\% as compared to
                 the traditional shared cache scheme. They also improve
                 the performance by 7.24\% and 17.88\% respectively.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "CMP; inter-processor; intra-processor; set pinning;
                 shared cache",
}

@Article{Tuck:2008:SSE,
  author =       "James Tuck and Wonsun Ahn and Luis Ceze and Josep
                 Torrellas",
  title =        "{SoftSig}: software-exposed hardware signatures for
                 code analysis and optimization",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "1",
  pages =        "145--156",
  month =        mar,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1346281.1346300",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:35 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Many code analysis techniques for optimization,
                 debugging, or parallelization need to perform runtime
                 disambiguation of sets of addresses. Such operations
                 can be supported efficiently and with low complexity
                 with hardware signatures.\par

                 To enable flexible use of signatures, this paper
                 proposes to expose a Signature Register File to the
                 software through a rich ISA. The software has great
                 flexibility to decide, for each signature, which
                 addresses to collect and which addresses to
                 disambiguate against. We call this architecture
                 SoftSig. In addition, as an example of SoftSig use, we
                 show how to detect redundant function calls efficiently
                 and eliminate them dynamically. We call this algorithm
                 MemoiSE. On average for five popular applications,
                 MemoiSE reduces the number of dynamic instructions by
                 9.3\%, thereby reducing the execution time of the
                 applications by 9\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "memory disambiguation; multi-core architectures;
                 runtime optimization",
}

@Article{Burcea:2008:PV,
  author          = {Ioana Burcea and Stephen Somogyi and Andreas Moshovos
                     and Babak Falsafi},
  title           = {Predictor virtualization},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {36},
  number          = {1},
  pages           = {157--167},
  month           = mar,
  year            = {2008},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/1346281.1346301},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Tue Jun 17 11:51:35 MDT 2008},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Many hardware optimizations rely on collecting
                     information about program behavior at runtime. This
                     information is stored in lookup tables. To be accurate
                     and effective, these optimizations usually require
                     large dedicated on-chip tables. Although technology
                     advances offer an increased amount of on-chip
                     resources, these resources are allocated to increase
                     the size of on-chip conventional cache
                     hierarchies.\par

                     This work proposes Predictor Virtualization, a
                     technique that uses the existing memory hierarchy to
                     emulate large predictor tables. We demonstrate the
                     benefits of this technique by virtualizing a
                     state-of-the-art data prefetcher. Full-system,
                     cycle-accurate simulations demonstrate that the
                     virtualized prefetcher preserves the performance
                     benefits of the original design, while reducing the
                     on-chip storage dedicated to the predictor table from
                     60KB down to less than one kilobyte.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  keywords        = {caches; memory hierarchy; metadata; predictor
                     virtualization},
}

@Article{Ganapathy:2008:DIM,
  author =       "Vinod Ganapathy and Matthew J. Renzelmann and Arini
                 Balakrishnan and Michael M. Swift and Somesh Jha",
  title =        "The design and implementation of microdrivers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "1",
  pages =        "168--178",
  month =        mar,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1346281.1346303",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:35 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Device drivers commonly execute in the kernel to
                 achieve high performance and easy access to kernel
                 services. However, this comes at the price of decreased
                 reliability and increased programming difficulty.
                 Driver programmers are unable to use user-mode
                 development tools and must instead use cumbersome
                 kernel tools. Faults in kernel drivers can cause the
                 entire operating system to crash. User-mode drivers
                 have long been seen as a solution to this problem, but
                 suffer from either poor performance or new interfaces
                 that require a rewrite of existing drivers.\par

                 This paper introduces the Microdrivers architecture
                 that achieves high performance and compatibility by
                 leaving critical path code in the kernel and moving the
                 rest of the driver code to a user-mode process. This
                 allows data-handling operations critical to I/O
                 performance to run at full speed, while management
                 operations such as initialization and configuration run
                 at reduced speed in user-level. To achieve
                 compatibility, we present DriverSlicer, a tool that
                 splits existing kernel drivers into a kernel-level
                 component and a user-level component using a small
                 number of programmer annotations. Experiments show that
                 as much as 65\% of driver code can be removed from the
                 kernel without affecting common-case performance, and
                 that only 1--6 percent of the code requires
                 annotations.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "device drivers; program partitioning; reliability",
}

@Article{Weinsberg:2008:TFC,
  author =       "Yaron Weinsberg and Danny Dolev and Tal Anker and Muli
                 Ben-Yehuda and Pete Wyckoff",
  title =        "Tapping into the fountain of {CPUs}: on operating
                 system support for programmable devices",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "1",
  pages =        "179--188",
  month =        mar,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1346281.1346304",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:35 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The constant race for faster and more powerful CPUs is
                 drawing to a close. No longer is it feasible to
                 significantly increase the speed of the CPU without
                 paying a crushing penalty in power consumption and
                 production costs. Instead of increasing single thread
                 performance, the industry is turning to multiple CPU
                 threads or cores (such as SMT and CMP) and
                 heterogeneous CPU architectures (such as the Cell
                 Broadband Engine). While this is a step in the right
                 direction, in every modern PC there is a wealth of
                 untapped compute resources. The NIC has a CPU; the disk
                 controller is programmable; some high-end graphics
                 adapters are already more powerful than host CPUs. Some
                 of these CPUs can perform some functions more
                 efficiently than the host CPUs. Our operating systems
                 and programming abstractions should be expanded to let
                 applications tap into these computational resources and
                 make the best use of them.\par

                 Therefore, we propose the HYDRA framework, which lets
                 application developers use
                 the combined power of every compute resource in a
                 coherent way. HYDRA is a programming model and a
                 runtime support layer which enables utilization of host
                 processors as well as various programmable peripheral
                 devices' processors. We present the framework and its
                 application for a demonstrative use-case, as well as
                 provide a thorough evaluation of its capabilities.
                 Using HYDRA we were able to cut down the development
                 cost of a system that uses multiple heterogeneous
                 compute resources significantly.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "offloading; operating systems; programming model",
}

@Article{Shen:2008:HCD,
  author =       "Kai Shen and Ming Zhong and Sandhya Dwarkadas and
                 Chuanpeng Li and Christopher Stewart and Xiao Zhang",
  title =        "Hardware counter driven on-the-fly request
                 signatures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "1",
  pages =        "189--200",
  month =        mar,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1346281.1346306",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:35 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Today's processors provide a rich source of
                 statistical information on application execution
                 through hardware counters. In this paper, we explore
                 the utilization of these statistics as request
                 signatures in server applications for identifying
                 requests and inferring high-level request properties
                 (e.g., CPU and I/O resource needs). Our key finding is
                 that effective request signatures may be constructed
                 using a small amount of hardware statistics while the
                 request is still in an early stage of its execution.
                 Such on-the-fly request identification and property
                 inference allow guided operating system adaptation at
                 request granularity (e.g., resource-aware request
                 scheduling and on-the-fly request classification). We
                 address the challenges of selecting hardware counter
                 metrics for signature construction and providing
                 necessary operating system support for per-request
                 statistics management. Our implementation in the Linux
                 2.6.10 kernel suggests that our approach requires low
                 overhead suitable for runtime deployment. Our
                 on-the-fly request resource consumption inference
                 (averaging 7\%, 3\%, 20\%, and 41\% prediction errors
                 for four server workloads, TPC-C, TPC-H, J2EE-based
                 RUBiS, and a trace-driven index search, respectively)
                 is much more accurate than the online running-average
                 based prediction (73--82\% errors). Its use for
                 resource-aware request scheduling results in a 15--70\%
                 response time reduction for three CPU-bound
                 applications. Its use for on-the-fly request
                 classification and anomaly detection exhibits high
                 accuracy for the TPC-H workload with synthetically
                 generated anomalous requests following a typical
                 SQL-injection attack pattern.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "anomaly detection; hardware counter; operating system
                 adaptation; request classification; server system",
}

@Article{VanErtvelde:2008:DPA,
  author =       "Luk {Van Ertvelde} and Lieven Eeckhout",
  title =        "Dispersing proprietary applications as benchmarks
                 through code mutation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "1",
  pages =        "201--210",
  month =        mar,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1353534.1346307",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:35 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Industry vendors hesitate to disseminate proprietary
                 applications to academia and third party vendors. By
                 consequence, the benchmarking process is typically
                 driven by standardized, open-source benchmarks which
                 may be very different from and likely not
                 representative of the real-life applications of
                 interest.\par

                 This paper proposes code mutation, a novel technique
                 that mutates a proprietary application to complicate
                 reverse engineering so that it can be distributed as a
                 benchmark. The benchmark mutant then serves as a proxy
                 for the proprietary application. The key idea in the
                 proposed code mutation approach is to preserve the
                 proprietary application's dynamic memory access and/or
                 control flow behavior in the benchmark mutant while
                 mutating the rest of the application code. To this end,
                 we compute program slices for memory access operations
                 and/or control flow operations trimmed through constant
                 value and branch profiles; and subsequently mutate the
                 instructions not appearing in these slices through
                 binary rewriting.\par

                 Our experimental results using SPEC CPU2000 and MiBench
                 benchmarks show that code mutation is a promising
                 technique that mutates up to 90\% of the static binary,
                 up to 50\% of the dynamically executed instructions,
                 and up to 35\% of the at run time exposed
                 inter-operation data dependencies. The performance
                 characteristics of the mutant are very similar to those
                 of the proprietary application across a wide range of
                 microarchitectures and hardware implementations.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "benchmark generation; code mutation",
}

@Article{Mysore:2008:UVF,
  author =       "Shashidhar Mysore and Bita Mazloom and Banit Agrawal
                 and Timothy Sherwood",
  title =        "Understanding and visualizing full systems with data
                 flow tomography",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "1",
  pages =        "211--221",
  month =        mar,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1353534.1346308",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:35 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "It is not uncommon for modern systems to be composed
                 of a variety of interacting services, running across
                 multiple machines in such a way that most developers do
                 not really understand the whole system. As abstraction
                 is layered atop abstraction, developers gain the
                 ability to compose systems of extraordinary complexity
                 with relative ease. However, many software properties,
                 especially those that cut across abstraction layers,
                 become very difficult to understand in such
                 compositions. The communication patterns involved, the
                 privacy of critical data, and the provenance of
                 information, can be difficult to find and understand,
                 even with access to all of the source code. The goal of
                 Data Flow Tomography is to use the inherent information
                 flow of such systems to help visualize the interactions
                 between complex and interwoven components across
                 multiple layers of abstraction. In the same way that
                 the injection of short-lived radioactive isotopes help
                 doctors trace problems in the cardiovascular system,
                 the use of `data tagging' can help developers slice
                 through the extraneous layers of software and pin-point
                 those portions of the system interacting with the data
                 of interest. To demonstrate the feasibility of this
                 approach we have developed a prototype system in which
                 tags are tracked both through the machine and in
                 between machines over the network, and from which novel
                 visualizations of the whole system can be derived. We
                 describe the system-level challenges in creating a
                 working system tomography tool and we qualitatively
                 evaluate our system by examining several example real
                 world scenarios.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "data flow tracking; tomography; virtual machine",
}

@Article{Ottoni:2008:COG,
  author =       "Guilherme Ottoni and David I. August",
  title =        "Communication optimizations for global multi-threaded
                 instruction scheduling",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "1",
  pages =        "222--232",
  month =        mar,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1353535.1346310",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:35 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The recent shift in the industry towards chip
                 multiprocessor (CMP) designs has brought the need for
                 multi-threaded applications to mainstream computing. As
                 observed in several limit studies, most of the
                 parallelization opportunities require looking for
                 parallelism beyond local regions of code. To exploit
                 these opportunities, especially for sequential
                 applications, researchers have recently proposed global
                 multi-threaded instruction scheduling techniques,
                 including DSWP and GREMIO. These techniques
                 simultaneously schedule instructions from large regions
                 of code, such as arbitrary loop nests or whole
                 procedures, and have been shown to be effective at
                 extracting threads for many applications. A key enabler
                 of these global instruction scheduling techniques is
                 the Multi-Threaded Code Generation (MTCG) algorithm
                 proposed in [16], which generates multi-threaded code
                 for any partition of the instructions into threads.
                 This algorithm inserts communication and
                 synchronization instructions in order to satisfy all
                 inter-thread dependences.\par

                 In this paper, we present a general compiler framework,
                 COCO, to optimize the communication and synchronization
                 instructions inserted by the MTCG algorithm. This
                 framework, based on thread-aware data-flow analyses and
                 graph min-cut algorithms, appropriately models and
                 optimizes all kinds of inter-thread dependences,
                 including register, memory, and control dependences.
                 Our experiments, using a fully automatic compiler
                 implementation of these techniques, demonstrate
                 significant reductions (about 30\% on average) in the
                 number of dynamic communication instructions in code
                 parallelized with DSWP and GREMIO. This reduction in
                 communication translates to performance gains of up to
                 40\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "communication; data-flow analysis; graph min-cut;
                 instruction scheduling; multi-threading;
                 synchronization",
}

@Article{Kulkarni:2008:OPB,
  author =       "Milind Kulkarni and Keshav Pingali and Ganesh
                 Ramanarayanan and Bruce Walter and Kavita Bala and L.
                 Paul Chew",
  title =        "Optimistic parallelism benefits from data
                 partitioning",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "1",
  pages =        "233--243",
  month =        mar,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1353534.1346311",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:35 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Recent studies of irregular applications such as
                 finite-element mesh generators and data-clustering
                 codes have shown that these applications have a
                 generalized data parallelism arising from the use of
                 iterative algorithms that perform computations on
                 elements of worklists. In some irregular applications,
                 the computations on different elements are independent.
                 In other applications, there may be complex patterns of
                 dependences between these computations.\par

                 The Galois system was designed to exploit this kind of
                 irregular data parallelism on multicore processors. Its
                 main features are (i) two kinds of set iterators for
                 expressing worklist-based data parallelism, and (ii) a
                 runtime system that performs optimistic parallelization
                 of these iterators, detecting conflicts and rolling
                 back computations as needed. Detection of conflicts and
                 rolling back iterations requires information from class
                 implementors.\par

                 In this paper, we introduce mechanisms to improve the
                 execution efficiency of Galois programs: data
                 partitioning, data-centric work assignment, lock
                 coarsening, and over-decomposition. These mechanisms
                 can be used to exploit locality of reference, reduce
                 mis-speculation, and lower synchronization overhead. We
                 also argue that the design of the Galois system permits
                 these mechanisms to be used with relatively little
                 modification to the user code. Finally, we present
                 experimental results that demonstrate the utility of
                 these mechanisms.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "data partitioning; irregular programs; locality; lock
                 coarsening; optimistic parallelism;
                 over-decomposition",
}

@Article{Cox:2008:XEO,
  author =       "Russ Cox and Tom Bergan and Austin T. Clements and
                 Frans Kaashoek and Eddie Kohler",
  title =        "{Xoc}, an extension-oriented compiler for systems
                 programming",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "1",
  pages =        "244--254",
  month =        mar,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1353535.1346312",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:35 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Today's system programmers go to great lengths to
                 extend the languages in which they program. For
                 instance, system-specific compilers find errors in
                 Linux and other systems, and add support for
                 specialized control flow to Qt and event-based
                 programs. These compilers are difficult to build and
                 cannot always understand each other's language changes.
                 However, they can greatly improve code
                 understandability and correctness, advantages that
                 should be accessible to all programmers.\par

                 We describe an extension-oriented compiler for C called
                 xoc. An extension-oriented compiler, unlike a
                 conventional extensible compiler, implements new
                 features via many small extensions that are loaded
                 together as needed. Xoc gives extension writers full
                 control over program syntax and semantics while hiding
                 many compiler internals. Xoc programmers concisely
                 define powerful compiler extensions that, by
                 construction, can be combined; even some parts of the
                 base compiler, such as GNU C compatibility, are
                 structured as extensions.\par

                 Xoc is based on two key interfaces. Syntax patterns
                 allow extension writers to manipulate language
                 fragments using concrete syntax. Lazy computation of
                 attributes allows extension writers to use the results
                 of analyses by other extensions or the core without
                 needing to worry about pass scheduling.\par

                 Extensions built using xoc include xsparse, a 345-line
                 extension that mimics Sparse, Linux's C front end, and
                 xlambda, a 170-line extension that adds function
                 expressions to C. An evaluation of xoc using these and
                 13 other extensions shows that xoc extensions are
                 typically more concise than equivalent extensions
                 written for conventional extensible compilers and that
                 it is possible to compose extensions.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "extension-oriented compilers",
}

@Article{Wells:2008:AIF,
  author =       "Philip M. Wells and Koushik Chakraborty and Gurindar
                 S. Sohi",
  title =        "Adapting to intermittent faults in multicore systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "1",
  pages =        "255--264",
  month =        mar,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1353536.1346314",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:35 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Future multicore processors will be more susceptible
                 to a variety of hardware failures. In particular,
                 intermittent faults, caused in part by manufacturing,
                 thermal, and voltage variations, can cause bursts of
                 frequent faults that last from several cycles to
                 several seconds or more. Due to practical limitations
                 of circuit techniques, cost-effective reliability will
                 likely require the ability to temporarily suspend
                 execution on a core during periods of intermittent
                 faults.\par

                 We investigate three of the most obvious techniques for
                 adapting to the dynamically changing resource
                 availability caused by intermittent faults, and
                 demonstrate their different system-level implications.
                 We show that system software reconfiguration has very
                 high overhead, that temporarily pausing execution on a
                 faulty core can lead to cascading livelock, and that
                 using spare cores has high fault-free cost. To remedy
                 these and other drawbacks of the three baseline
                 techniques, we propose using a thin hardware/firmware
                 layer to manage an overcommitted system --- one where
                 the OS is configured to use more virtual processors
                 than the number of currently available physical cores.
                 We show that this proposed technique can gracefully
                 degrade performance during intermittent faults of
                 various duration with low overhead, without involving
                 system software, and without requiring spare cores.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "intermittent faults; overcommitted system",
}

@Article{Li:2008:UPH,
  author =       "Man-Lap Li and Pradeep Ramachandran and Swarup Kumar
                 Sahoo and Sarita V. Adve and Vikram S. Adve and
                 Yuanyuan Zhou",
  title =        "Understanding the propagation of hard errors to
                 software and implications for resilient system design",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "1",
  pages =        "265--276",
  month =        mar,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1346281.1346315",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:35 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "With continued CMOS scaling, future shipped hardware
                 will be increasingly vulnerable to in-the-field faults.
                 To be broadly deployable, the hardware reliability
                 solution must incur low overheads, precluding use of
                 expensive redundancy. We explore a cooperative
                 hardware-software solution that watches for anomalous
                 software behavior to indicate the presence of hardware
                 faults. Fundamental to such a solution is a
                 characterization of how hardware faults in different
                 microarchitectural structures of a modern processor
                 propagate through the application and OS.\par

                 This paper aims to provide such a characterization,
                 resulting in identifying low-cost detection methods and
                 providing guidelines for implementation of the recovery
                 and diagnosis components of such a reliability
                 solution. We focus on hard faults because they are
                 increasingly important and have different system
                 implications than the much studied transients. We
                 achieve our goals through fault injection experiments
                 with a microarchitecture-level full system timing
                 simulator. Our main results are: (1) we are able to
                 detect 95\% of the unmasked faults in 7 out of 8
                 studied microarchitectural structures with simple
                 detectors that incur zero to little hardware overhead;
                 (2) over 86\% of these detections are within latencies
                 that existing hardware checkpointing schemes can
                 handle, while others require software checkpointing;
                 and (3) a surprisingly large fraction of the detected
                 faults corrupt OS state, but almost all of these are
                 detected with latencies short enough to use hardware
                 checkpointing, thereby enabling OS recovery in
                 virtually all such cases.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "architecture; error detection; fault injection;
                 permanent fault",
}

@Article{Suleman:2008:FDT,
  author =       "M. Aater Suleman and Moinuddin K. Qureshi and Yale N.
                 Patt",
  title =        "Feedback-driven threading: power-efficient and
                 high-performance execution of multi-threaded workloads
                 on {CMPs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "1",
  pages =        "277--286",
  month =        mar,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1346281.1346317",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:35 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Extracting high-performance from the emerging Chip
                 Multiprocessors (CMPs) requires that the application be
                 divided into multiple threads. Each thread executes on
                 a separate core thereby increasing concurrency and
                 improving performance. As the number of cores on a CMP
                 continues to increase, the performance of some
                 multi-threaded applications will benefit from the
                 increased number of threads, whereas, the performance
                 of other multi-threaded applications will become
                 limited by data-synchronization and off-chip bandwidth.
                 For applications that get limited by
                 data-synchronization, increasing the number of threads
                 significantly degrades performance and increases
                 on-chip power. Similarly, for applications that get
                 limited by off-chip bandwidth, increasing the number of
                 threads increases on-chip power without providing any
                 performance improvement. Furthermore, whether an
                 application gets limited by data-synchronization, or
                 bandwidth, or neither depends not only on the
                 application but also on the input set and the machine
                 configuration. Therefore, controlling the number of
                 threads based on the run-time behavior of the
                 application can significantly improve performance and
                 reduce power.\par

                 This paper proposes Feedback-Driven Threading (FDT), a
                 framework to dynamically control the number of threads
                 using run-time information. FDT can be used to
                 implement Synchronization-Aware Threading (SAT), which
                 predicts the optimal number of threads depending on the
                 amount of data-synchronization. Our evaluation shows
                 that SAT can reduce both execution time and power by up
                 to 66\% and 78\% respectively. Similarly, FDT can be
                 used to implement Bandwidth-Aware Threading (BAT),
                 which predicts the minimum number of threads required
                 to saturate the off-chip bus. Our evaluation shows that
                 BAT reduces on-chip power by up to 78\%. When SAT and
                 BAT are combined, the average execution time reduces by
                 17\% and power reduces by 59\%. The proposed techniques
                 leverage existing performance counters and require
                 minimal support from the threading library.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "bandwidth; CMP; multi-threaded; synchronization",
}

@Article{Linderman:2008:MPM,
  author =       "Michael D. Linderman and Jamison D. Collins and Hong
                 Wang and Teresa H. Meng",
  title =        "{Merge}: a programming model for heterogeneous
                 multi-core systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "1",
  pages =        "287--296",
  month =        mar,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1346281.1346318",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:35 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In this paper we propose the Merge framework, a
                 general purpose programming model for heterogeneous
                 multi-core systems. The Merge framework replaces
                 current ad hoc approaches to parallel programming on
                 heterogeneous platforms with a rigorous, library-based
                 methodology that can automatically distribute
                 computation across heterogeneous cores to achieve
                 increased energy and performance efficiency. The Merge
                 framework provides (1) a predicate dispatch-based
                 library system for managing and invoking function
                 variants for multiple architectures; (2) a high-level,
                 library-oriented parallel language based on map-reduce;
                 and (3) a compiler and runtime which implement the
                 map-reduce language pattern by dynamically selecting
                 the best available function implementations for a given
                 input and machine configuration. Using a generic
                 sequencer architecture interface for heterogeneous
                 accelerators, the Merge framework can integrate
                 function variants for specialized accelerators,
                 offering the potential for to-the-metal performance for
                 a wide range of heterogeneous architectures, all
                 transparent to the user. The Merge framework has been
                 prototyped on a heterogeneous platform consisting of an
                 Intel Core 2 Duo CPU and an 8-core 32-thread Intel
                 Graphics and Media Accelerator X3000, and a homogeneous
                 32-way Unisys SMP system with Intel Xeon processors. We
                 implemented a set of benchmarks using the Merge
                 framework and enhanced the library with X3000 specific
                 implementations, achieving speedups of 3.6x -- 8.5x
                 using the X3000 and 5.2x -- 22x using the 32-way system
                 relative to the straight C reference implementation on
                 a single IA32 core.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "GPGPU; heterogeneous multi-core; predicate dispatch",
}

@Article{Gummaraju:2008:SPG,
  author =       "Jayanth Gummaraju and Joel Coburn and Yoshio Turner
                 and Mendel Rosenblum",
  title =        "{Streamware}: programming general-purpose multicore
                 processors using streams",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "1",
  pages =        "297--307",
  month =        mar,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1346281.1346319",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:35 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Recently, the number of cores on general-purpose
                 processors has been increasing rapidly. Using
                 conventional programming models, it is challenging to
                 effectively exploit these cores for maximal
                 performance. An interesting alternative candidate for
                 programming multiple cores is the stream programming
                 model, which provides a framework for writing programs
                 in a sequential-style while greatly simplifying the
                 task of automatic parallelization. It has been shown
                 that not only traditional media/image applications but
                 also more general-purpose data-intensive applications
                 can be expressed in the stream programming
                 style.\par

                 In this paper, we investigate the potential to use the
                 stream programming model to efficiently utilize
                 commodity multicore general-purpose processors (e.g.,
                 Intel/AMD). Although several stream languages and
                 stream compilers have recently been developed, they
                 typically target special-purpose stream processors. In
                 contrast, we propose a flexible software system,
                 Streamware, which automatically maps stream programs
                 onto a wide variety of general-purpose multicore
                 processor configurations. We leverage existing
                 compilation framework for stream processors and design
                 a runtime environment which takes as input the output
                 of these stream compilers in the form of
                 machine-independent stream virtual machine code. The
                 runtime environment assigns work to processor cores
                 considering processor/cache configurations and adapts
                 to workload variations. We evaluate this approach for a
                 few general-purpose scientific applications on real
                 hardware and a cycle-level simulator set-up to showcase
                 scaling and contention issues. The results show that
                 the stream programming model is a good choice for
                 efficiently exploiting modern and future multicore CPUs
                 for an important class of applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "general-purpose multicore processors; programming;
                 runtime system; streams",
}

@Article{Nightingale:2008:PSC,
  author =       "Edmund B. Nightingale and Daniel Peek and Peter M.
                 Chen and Jason Flinn",
  title =        "Parallelizing security checks on commodity hardware",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "1",
  pages =        "308--318",
  month =        mar,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1346281.1346321",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:35 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Speck (Speculative Parallel Check) is a system that
                 accelerates powerful security checks on commodity
                 hardware by executing them in parallel on multiple
                 cores. Speck provides an infrastructure that allows
                 sequential invocations of a particular security check
                 to run in parallel without sacrificing the safety of
                 the system. Speck creates parallelism in two ways.
                 First, Speck decouples a security check from an
                 application by continuing the application, using
                 speculative execution, while the security check
                 executes in parallel on another core. Second, Speck
                 creates parallelism between sequential invocations of a
                 security check by running later checks in parallel with
                 earlier ones. Speck provides a process-level replay
                 system to deterministically and efficiently synchronize
                 state between a security check and the original
                 process. We use Speck to parallelize three security
                 checks: sensitive data analysis, on-access virus
                 scanning, and taint propagation. Running on a 4-core
                 and an 8-core computer, Speck improves performance 4x
                 and 7.5x for the sensitive data analysis check, 3.3x
                 and 2.8x for the on-access virus scanning check, and
                 1.6x and 2x for the taint propagation check.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "operating systems; parallel; performance; security;
                 speculative execution",
}

@Article{Castro:2008:BBR,
  author =       "Miguel Castro and Manuel Costa and Jean-Philippe
                 Martin",
  title =        "Better bug reporting with better privacy",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "1",
  pages =        "319--328",
  month =        mar,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1346281.1346322",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:35 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Software vendors collect bug reports from customers to
                 improve the quality of their software. These reports
                 should include the inputs that make the software fail,
                 to enable vendors to reproduce the bug. However,
                 vendors rarely include these inputs in reports because
                 they may contain private user data. We describe a
                 solution to this problem that provides software vendors
                 with new input values that satisfy the conditions
                 required to make the software follow the same execution
                 path until it fails, but are otherwise unrelated with
                 the original inputs. These new inputs allow vendors to
                 reproduce the bug while revealing less private
                 information than existing approaches. Additionally, we
                 provide a mechanism to measure the amount of
                 information revealed in an error report. This mechanism
                 allows users to perform informed decisions on whether
                 or not to submit reports. We implemented a prototype of
                 our solution and evaluated it with real errors in real
                 programs. The results show that we can produce error
                 reports that allow software vendors to reproduce bugs
                 while revealing almost no private information.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "bug reports; constraint solving; privacy; symbolic
                 execution",
}

@Article{Lu:2008:LMC,
  author =       "Shan Lu and Soyeon Park and Eunsoo Seo and Yuanyuan
                 Zhou",
  title =        "Learning from mistakes: a comprehensive study on real
                 world concurrency bug characteristics",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "1",
  pages =        "329--339",
  month =        mar,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1353536.1346323",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jun 17 11:51:35 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The reality of multi-core hardware has made concurrent
                 programs pervasive. Unfortunately, writing correct
                 concurrent programs is difficult. Addressing this
                 challenge requires advances in multiple directions,
                 including concurrency bug detection, concurrent program
                 testing, concurrent programming model design, etc.
                 Designing effective techniques in all these directions
                 will significantly benefit from a deep understanding of
                 real world concurrency bug characteristics.\par

                 This paper provides the first (to the best of our
                 knowledge) comprehensive real world concurrency bug
                 characteristic study. Specifically, we have carefully
                 examined concurrency bug patterns, manifestation, and
                 fix strategies of 105 randomly selected real world
                 concurrency bugs from 4 representative server and
                 client open-source applications (MySQL, Apache, Mozilla
                 and OpenOffice). Our study reveals several interesting
                 findings and provides useful guidance for concurrency
                 bug detection, testing, and concurrent programming
                 language design.\par

                 Some of our findings are as follows: (1) Around one
                 third of the examined non-deadlock concurrency bugs are
                 caused by violation to programmers' order intentions,
                 which may not be easily expressed via synchronization
                 primitives like locks and transactional memories; (2)
                 Around 34\% of the examined non-deadlock concurrency
                 bugs involve multiple variables, which are not well
                 addressed by existing bug detection tools; (3) About
                 92\% of the examined concurrency bugs can be reliably
                 triggered by enforcing certain orders among no more
                 than 4 memory accesses. This indicates that testing
                 concurrent programs can target at exploring possible
                 orders among every small groups of memory accesses,
                 instead of among all memory accesses; (4) About 73\% of
                 the examined non-deadlock concurrency bugs were not
                 fixed by simply adding or changing locks, and many of
                 the fixes were not correct at the first try, indicating
                 the difficulty of reasoning concurrent execution by
                 programmers.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "bug characteristics; concurrency bug; concurrent
                 program",
}

@Article{Anonymous:2008:MGC,
  author =       "Anonymous",
  title =        "Message from the {General Chairs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "x--x",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1394608.1382166",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Anonymous:2008:MPC,
  author =       "Anonymous",
  title =        "Message from the {Program Chair}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "xi--xi",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1394608.1382167",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Anonymous:2008:R,
  author =       "Anonymous",
  title =        "Reviewers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "xv--xviii",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1394608.1382168",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tseng:2008:AOP,
  author =       "Francis Tseng and Yale N. Patt",
  title =        "Achieving Out-of-Order Performance with Almost
                 In-Order Complexity",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "3--12",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1394608.1382169",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "There is still much performance to be gained by
                 out-of-order processors with wider issue widths.
                 However, traditional methods of increasing issue width
                 do not scale; that is, they drastically increase design
                 complexity and power requirements. This paper
                 introduces the braid, a compile-time identified entity
                 that enables the execution core to scale to wider
                 widths by exploiting the small fanout and short
                 lifetime of values produced by the program. Braid
                 processing requires identification by the compiler,
                 minor extensions to the ISA, and support by the
                 microarchitecture. The result from processing braids is
                 performance within 9\% of a very aggressive
                 conventional out-of-order microarchitecture with almost
                 the complexity of an in-order implementation.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Agarwal:2008:FCR,
  author =       "Mayank Agarwal and Nitin Navale and Kshitiz Malik and
                 Matthew I. Frank",
  title =        "Fetch-Criticality Reduction through Control
                 Independence",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "13--24",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1109/ISCA.2008.39",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Architectures that exploit control independence (CI)
                 promise to remove in-order fetch bottlenecks, like
                 branch mispredicts, instruction-cache misses and fetch
                 unit stalls, from the critical path of single-threaded
                 execution. By exposing more fetch options, however, CI
                 architectures also expose more performance tradeoffs.
                 These tradeoffs make it hard to design policies that
                 deliver good performance. This paper presents a
                 criticality-based model for reasoning about CI
                 architectures, and uses that model to describe the
                 tradeoffs between gains from control independence
                 versus increased costs of honoring data dependences.
                 The model is then used to derive the design of a
                 criticality-aware task selection policy that strikes
                 the right balance between fetch-criticality and
                 execute-criticality. Finally, the paper validates the
                 model by attacking branch-misprediction induced
                 fetch-criticality through the above derived spawn
                 policy. This leads to as high as 100\% improvements in
                 performance, and in the region of 40\% or more
                 improvements for four of the benchmarks where this is
                 the main problem. Criticality analysis shows that this
                 improvement arises due to reduced fetch-criticality.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "control independence; fetch-criticality; implicit
                 parallelization",
}

@Article{Pericas:2008:TLL,
  author =       "Miquel Peric{\`a}s and Adrian Cristal and Francisco J.
                 Cazorla and Ruben Gonz{\'a}lez and Alex Veidenbaum and
                 Daniel A. Jim{\'e}nez and Mateo Valero",
  title =        "A Two-Level Load\slash Store Queue Based on Execution
                 Locality",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "25--36",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1109/ISCA.2008.10",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Multicore processors have emerged as a powerful
                 platform on which to efficiently exploit thread-level
                 parallelism (TLP). However, due to Amdahl's Law, such
                 designs will be increasingly limited by the remaining
                 sequential components of applications. To overcome this
                 limitation it is necessary to design processors with
                 many lower-performance cores for TLP and some
                 high-performance cores designed to execute sequential
                 algorithms. Such cores will need to address the
                 memory-wall by implementing kilo-instruction windows.
                 Large window processors require large Load/Store Queues
                 that would be too slow if implemented using current
                 CAM-based designs. This paper proposes an Epoch-based
                 Load Store Queue (ELSQ), a new design based on
                 Execution Locality. It is integrated into a
                 large-window processor that has a fast, out-of-order
                 core operating only on L1/L2 cache hits and N slower
                 cores that process L2 misses and their dependent
                 instructions. The large LSQ is coupled with the slow
                 cores and is partitioned into N small and local LSQs,
                 one per core. We evaluate ELSQ in a large-window
                 environment, finding that it enables high performance
                 at low power. By exploiting locality among loads and
                 stores, ELSQ outperforms even an idealized central LSQ
                 when implemented on top of a decoupled processor
                 design.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "execution locality; kilo-instruction processors;
                 load/store queue; power-efficiency",
}

@Article{Ipek:2008:SOM,
  author =       "Engin Ipek and Onur Mutlu and Jos{\'e} F.
                 Mart{\'\i}nez and Rich Caruana",
  title =        "Self-Optimizing Memory Controllers: a Reinforcement
                 Learning Approach",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "39--50",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1394608.1382172",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Efficiently utilizing off-chip DRAM bandwidth is a
                 critical issue in designing cost-effective,
                 high-performance chip multiprocessors (CMPs).
                 Conventional memory controllers deliver relatively low
                 performance in part because they often employ
                 fixed, rigid access scheduling policies designed for
                 average-case application behavior. As a result, they
                 cannot learn and optimize the long-term performance
                 impact of their scheduling decisions, and cannot adapt
                 their scheduling policies to dynamic workload behavior.
                 We propose a new, self-optimizing memory controller
                 design that operates using the principles of
                 reinforcement learning (RL) to overcome these
                 limitations. Our RL-based memory controller observes
                 the system state and estimates the long-term
                 performance impact of each action it can take. In this
                 way, the controller learns to optimize its scheduling
                 policy on the fly to maximize long-term performance.
                 Our results show that an RL-based memory controller
                 improves the performance of a set of parallel
                 applications run on a 4-core CMP by 19\% on average
                 (up to 33\%), and it improves DRAM bandwidth
                 utilization by 22\% compared to a state-of-the-art
                 controller.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "chip multiprocessors; machine learning; memory
                 controller; memory systems; reinforcement learning",
}

@Article{Thoziyoor:2008:CMM,
  author =       "Shyamkumar Thoziyoor and Jung Ho Ahn and Matteo
                 Monchiero and Jay B. Brockman and Norman P. Jouppi",
  title =        "A Comprehensive Memory Modeling Tool and Its
                 Application to the Design and Analysis of Future Memory
                 Hierarchies",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "51--62",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1109/ISCA.2008.16",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In this paper we introduce CACTI-D, a significant
                 enhancement of CACTI 5.0. CACTI-D adds support for
                 modeling of commodity DRAM technology and support for
                 main memory DRAM chip organization. CACTI-D enables
                 modeling of the complete memory hierarchy with
                 consistent models all the way from SRAM based L1 caches
                 through main memory DRAMs on DIMMs. We illustrate the
                 potential applicability of CACTI-D in the design and
                 analysis of future memory hierarchies by carrying out a
                 last level cache study for a multicore multithreaded
                 architecture at the 32nm technology node. In this study
                 we use CACTI-D to model all components of the memory
                 hierarchy including L1, L2, last level SRAM, logic
                 process based DRAM or commodity DRAM L3 caches, and
                 main memory DRAM chips. We carry out architectural
                 simulation using benchmarks with large data sets and
                 present results of their execution time, breakdown of
                 power in the memory hierarchy, and system energy-delay
                 product for the different system configurations. We
                 find that commodity DRAM technology is most attractive
                 for stacked last level caches, with significantly lower
                 energy-delay products.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "cache; CACTI; commodity DRAM; LLC; logic-process based
                 DRAM; SRAM",
}

@Article{Mutlu:2008:PAB,
  author =       "Onur Mutlu and Thomas Moscibroda",
  title =        "Parallelism-Aware Batch Scheduling: Enhancing both
                 Performance and Fairness of Shared {DRAM} Systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "63--74",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1394608.1382128",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In a chip-multiprocessor (CMP) system, the DRAM system
                 is shared among cores. In a shared DRAM system,
                 requests from a thread can not only delay requests from
                 other threads by causing bank/bus/row-buffer conflicts
                 but they can also destroy other threads'
                 DRAM-bank-level parallelism. Requests whose latencies
                 would otherwise have been overlapped could effectively
                 become serialized. As a result both fairness and system
                 throughput degrade, and some threads can starve for
                 long time periods. This paper proposes a fundamentally
                 new approach to designing a shared DRAM controller that
                 provides quality of service to threads, while also
                 improving system throughput. Our parallelism-aware
                 batch scheduler (PAR-BS) design is based on two key
                 ideas. First, PAR-BS processes DRAM requests in batches
                 to provide fairness and to avoid starvation of
                 requests. Second, to optimize system throughput, PAR-BS
                 employs a parallelism-aware DRAM scheduling policy that
                 aims to process requests from a thread in parallel in
                 the DRAM banks, thereby reducing the memory-related
                 stall-time experienced by the thread. PAR-BS seamlessly
                 incorporates support for system-level thread priorities
                 and can provide different service levels, including
                 purely opportunistic service, to threads with different
                 priorities. We evaluate the design trade-offs involved
                 in PAR-BS and compare it to four previously proposed
                 DRAM scheduler designs on 4-, 8-, and 16-core systems.
                 Our evaluations show that, averaged over 100 4-core
                 workloads, PAR-BS improves fairness by 1.11X and system
                 throughput by 8.3\% compared to the best previous
                 scheduling technique, Stall-Time Fair Memory (STFM)
                 scheduling. Based on simple request prioritization
                 rules, PAR-BS is also simpler to implement than STFM.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "chip multiprocessors; DRAM systems; fairness; memory
                 scheduling; memory systems; memory-level parallelism;
                 multi-core systems; quality of service",
}

@Article{Kim:2008:TDH,
  author =       "John Kim and William J. Dally and Steve Scott and
                 Dennis Abts",
  title =        "Technology-Driven, Highly-Scalable {Dragonfly}
                 Topology",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "77--88",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1109/ISCA.2008.19",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Evolving technology and increasing pin-bandwidth
                 motivate the use of high-radix routers to reduce the
                 diameter, latency, and cost of interconnection
                 networks. High-radix networks, however, require longer
                 cables than their low-radix counterparts. Because
                 cables dominate network cost, the number of cables, and
                 particularly the number of long, global cables should
                 be minimized to realize an efficient network. In this
                 paper, we introduce the dragonfly topology which uses a
                 group of high-radix routers as a virtual router to
                 increase the effective radix of the network. With this
                 organization, each minimally routed packet traverses at
                 most one global channel. By reducing global channels, a
                 dragonfly reduces cost by 20\% compared to a flattened
                 butterfly and by 52\% compared to a folded Clos network
                 in configurations with $ \geq $ 16K nodes. We also
                 introduce two new variants of global adaptive routing
                 that enable load-balanced routing in the dragonfly.
                 Each router in a dragonfly must make an adaptive
                 routing decision based on the state of a global channel
                 connected to a different router. Because of the
                 indirect nature of this routing decision, conventional
                 adaptive routing algorithms give degraded performance.
                 We introduce the use of selective virtual-channel
                 discrimination and the use of credit round-trip latency
                 to both sense and signal channel congestion. The
                 combination of these two methods gives throughput and
                 latency that approaches that of an ideal adaptive
                 routing algorithm.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "dragonfly; interconnection networks; topology",
}

@Article{Lee:2008:GSF,
  author =       "Jae W. Lee and Man Cheuk Ng and Krste Asanovic",
  title =        "Globally-Synchronized Frames for Guaranteed
                 Quality-of-Service in On-Chip Networks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "89--100",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1394608.1382130",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Future chip multiprocessors (CMPs) may have hundreds
                 to thousands of threads competing to access shared
                 resources, and will require quality-of-service (QoS)
                 support to improve system utilization. Although there
                 has been significant work in QoS support within
                 resources such as caches and memory controllers, there
                 has been less attention paid to QoS support in the
                 multi-hop on-chip networks that will form an important
                 component in future systems. In this paper we introduce
                 Globally-Synchronized Frames (GSF), a framework for
                 providing guaranteed QoS in on-chip networks in terms
                 of minimum bandwidth and a maximum delay bound. The GSF
                 framework can be easily integrated in a conventional
                 virtual channel (VC) router without significantly
                 increasing the hardware complexity. We rely on a fast
                 barrier network, which is feasible in an on-chip
                 environment, to efficiently implement GSF. Performance
                 guarantees are verified by both analysis and
                 simulation. According to our simulations, all
                 concurrent flows receive their guaranteed minimum share
                 of bandwidth in compliance with a given bandwidth
                 allocation. The average throughput degradation of GSF
                 on a 8x8 mesh network is within 10\% compared to the
                 conventional best-effort VC router in most cases.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "chip multiprocessors; interconnects; multicores;
                 on-chip network; quality-of-service; resource
                 management; router; software interface",
}

@Article{Kim:2008:PCN,
  author =       "Martha Mercaldi Kim and John D. Davis and Mark Oskin
                 and Todd Austin",
  title =        "Polymorphic On-Chip Networks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "101--112",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1109/ISCA.2008.25",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "As the number of cores per die increases, be they
                 processors, memory blocks, or custom accelerators, the
                 on-chip interconnect the cores use to communicate gains
                 importance. We begin this study with an
                 area-performance analysis of the interconnect design
                 space. We find that there is no single network design
                 that yields optimal performance across a range of
                 traffic patterns. This indicates that there is an
                 opportunity to gain performance by customizing the
                 interconnect to a particular application or workload.
                 We propose polymorphic on-chip networks to enable
                 per-application network customization. This network can
                 be configured prior to application runtime, to have the
                 topology and buffering of arbitrary network designs.
                 This paper proposes one such polymorphic network
                 architecture. We demonstrate its modes of
                 configurability, and evaluate the polymorphic network
                 architecture design space, producing polymorphic
                 fabrics that minimize the network area overhead.
                 Finally, we expand the network on chip design space to
                 include a polymorphic network design, showing that a
                 single polymorphic network is capable of implementing
                 all of the pareto optimal fixed-network designs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "configurable hardware; on-chip network",
}

@Article{Baugh:2008:UHM,
  author =       "Lee Baugh and Naveen Neelakantam and Craig Zilles",
  title =        "Using Hardware Memory Protection to Build a
                 High-Performance, Strongly-Atomic Hybrid Transactional
                 Memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "115--126",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1394608.1382132",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "We demonstrate how fine-grained memory protection can
                 be used in support of transactional memory systems:
                 first showing how a software transactional memory
                 system (STM) can be made strongly atomic by using
                 memory protection on transactionally-held state, then
                 showing how such a strongly-atomic STM can be used with
                 a bounded hardware TM system to build a hybrid TM
                 system in which zero-overhead hardware transactions may
                 safely run concurrently with potentially-conflicting
                 software transactions. We experimentally demonstrate
                 how this hybrid TM organization avoids the common-case
                 overheads associated with previous hybrid TM proposals,
                 achieving performance rivaling an unbounded HTM system
                 without the hardware complexity of ensuring completion
                 of arbitrary transactions in hardware. As part of our
                 findings, we identify key policies regarding contention
                 management within and across the hardware and software
                 TM components that are key to achieving robust
                 performance with a hybrid TM.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "abort handler; hybrid; memory protection; primitives;
                 strong atomicity; transactional memory",
}

@Article{Bobba:2008:TEE,
  author =       "Jayaram Bobba and Neelam Goyal and Mark D. Hill and
                 Michael M. Swift and David A. Wood",
  title =        "{TokenTM}: Efficient Execution of Large Transactions
                 with Hardware Transactional Memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "127--138",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1394608.1382133",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Current hardware transactional memory systems seek to
                 simplify parallel programming, but assume that large
                 transactions are rare, so it is acceptable to penalize
                 their performance or concurrency. However, future
                 programmers may wish to use large transactions more
                 often in order to integrate with higher-level
                 programming models (e.g., database transactions) or
                 perform selected I/O operations. To prevent the ``small
                 transactions are common'' assumption from becoming
                 self-fulfilling, this paper contributes TokenTM --- an
                 unbounded HTM that uses the abstraction of tokens to
                 precisely track conflicts on an unbounded number of
                 memory blocks. TokenTM implements tokens with new
                 mechanisms, including metastate fission/fusion and fast
                 token release. TokenTM executes small transactions
                 fast, executes concurrent large transactions with no
                 penalty to nonconflicting transactions, and gracefully
                 handles paging, context switching, and System-V-style
                 shared memory.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "coherence protocols; hardware transactional memory;
                 metastates; tokens; transactional memory; unbounded
                 transactions",
}

@Article{Shriraman:2008:FDT,
  author =       "Arrvindh Shriraman and Sandhya Dwarkadas and Michael
                 L. Scott",
  title =        "Flexible Decoupled Transactional Memory Support",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "139--150",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1109/ISCA.2008.17",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "A high-concurrency transactional memory (TM)
                 implementation needs to track concurrent accesses,
                 buffer speculative updates, and manage conflicts. We
                 present a system, FlexTM (FLEXible Transactional
                 Memory), that coordinates four decoupled hardware
                 mechanisms: read and write signatures, which summarize
                 per-thread access sets; per-thread conflict summary
                 tables (CSTs), which identify the threads with which
                 conflicts have occurred; Programmable Data Isolation,
                 which maintains speculative updates in the local cache
                 and employs a thread-private buffer (in virtual memory)
                 in the rare event of overflow; and Alert-On-Update,
                 which selectively notifies threads about coherence
                 events. All mechanisms are software-accessible, to
                 enable virtualization and to support transactions of
                 arbitrary length. FlexTM allows software to determine
                 when to manage conflicts (either eagerly or lazily),
                 and to employ a variety of conflict management and
                 commit protocols. We describe an STM-inspired protocol
                 that uses CSTs to manage conflicts in a distributed
                 manner (no global arbitration) and allows parallel
                 commits. In experiments with a prototype on
                 Simics/GEMS, FlexTM exhibits 5x speedup over
                 high-quality software TM, with no loss in policy
                 flexibility. Its distributed commit protocol is also
                 more efficient than a central hardware manager. Our
                 results highlight the importance of flexibility in
                 determining when to manage conflicts: lazy maximizes
                 concurrency and helps to ensure forward progress while
                 eager provides better overall utilization in a
                 multi-programmed system.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "cache coherence; Conflict detection; FlexTM; Hardware;
                 Multiprocessors; RTM; Transactional memory",
}

@Article{Vantrease:2008:CSI,
  author =       "Dana Vantrease and Robert Schreiber and Matteo
                 Monchiero and Moray McLaren and Norman P. Jouppi and
                 Marco Fiorentino and Al Davis and Nathan Binkert and
                 Raymond G. Beausoleil and Jung Ho Ahn",
  title =        "{Corona}: System Implications of Emerging Nanophotonic
                 Technology",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "153--164",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1394608.1382135",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "We expect that many-core microprocessors will push
                 performance per chip from the 10 gigaflop to the 10
                 teraflop range in the coming decade. To support this
                 increased performance, memory and inter-core bandwidths
                 will also have to scale by orders of magnitude. Pin
                 limitations, the energy cost of electrical signaling,
                 and the non-scalability of chip-length global wires are
                 significant bandwidth impediments. Recent developments
                 in silicon nanophotonic technology have the potential
                 to meet these off- and on-stack bandwidth requirements
                 at acceptable power levels. Corona is a 3D many-core
                 architecture that uses nanophotonic communication for
                 both inter-core communication and off-stack
                 communication to memory or I/O devices. Its peak
                 floating-point performance is 10 teraflops. Dense
                 wavelength division multiplexed optically connected
                 memory modules provide 10 terabyte per second memory
                 bandwidth. A photonic crossbar fully interconnects its
                 256 low-power multithreaded cores at 20 terabyte per
                 second bandwidth. We have simulated a 1024 thread
                 Corona system running synthetic benchmarks and scaled
                 versions of the SPLASH-2 benchmark suite. We believe
                 that in comparison with an electrically-connected
                 many-core alternative that uses the same on-stack
                 interconnect power, Corona can provide 2 to 6 times
                 more performance on many memory intensive workloads,
                 while simultaneously reducing power.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "3D stacking; many-core CMP; nanophotonics; on-chip
                 Networks",
}

@Article{Kreger-Stickles:2008:MAI,
  author =       "Lucas Kreger-Stickles and Mark Oskin",
  title =        "Microcoded Architectures for Ion-Tap Quantum
                 Computers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "165--176",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1394608.1382136",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In this paper we present the first ever systematic
                 design space exploration of microcoded software fault
                 tolerant ion-trap quantum computers. This exploration
                 reveals the critical importance of a well-tuned
                 microcode for providing high performance and ensuring
                 system reliability. In addition, we find that, despite
                 recent advances in the reliability of quantum memory,
                 the impact of errors due to stored quantum data is now,
                 and will continue to be, a major source of systemic
                 error. Finally, our exploration reveals a single design
                 which out performs all others we considered in run
                 time, fidelity and area. For completeness our design
                 space exploration includes designs from prior work and
                 we find a novel design that is 1/2 the size, 3 times as
                 fast, and an order of magnitude more reliable.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "architecture; ion-trap; microcoded; quantum",
}

@Article{Isailovic:2008:RQC,
  author =       "Nemanja Isailovic and Mark Whitney and Yatish Patel
                 and John Kubiatowicz",
  title =        "Running a Quantum Circuit at the Speed of Data",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "177--188",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1394608.1382137",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "We analyze circuits for kernels from popular quantum
                 computing applications, characterizing the hardware
                 resources necessary to take ancilla preparation off the
                 critical path. The result is a chip entirely dominated
                 by ancilla generation circuits. To address this issue,
                 we introduce optimized ancilla factories and analyze
                 their structure and physical layout for ion trap
                 technology. We introduce a new quantum computing
                 architecture with highly concentrated data-only regions
                 surrounded by shared ancilla factories. The results are
                 a reduced dependence on costly teleportation, more
                 efficient distribution of generated ancillae and more
                 than five times speedup over previous proposals.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "ancilla factory; microarchitecture; quantum",
}

@Article{Liang:2008:RVT,
  author =       "Xiaoyao Liang and Gu-Yeon Wei and David Brooks",
  title =        "{ReVIVaL}: a Variation-Tolerant Architecture Using
                 Voltage Interpolation and Variable Latency",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "191--202",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1394608.1382138",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Process variations are poised to significantly degrade
                 performance benefits sought by moving to the next
                 nanoscale technology node. Parameter fluctuations in
                 devices can introduce large variations in peak
                 operation among chips, among cores on a single chip,
                 and among microarchitectural blocks within one core.
                 Hence, it will be difficult to only rely on traditional
                 frequency binning to efficiently cover the large
                 variations that are expected. Furthermore, multiple
                 voltage/frequency domains introduce significant
                 hardware overhead and alone cannot address the full
                 extent of delay variations expected in future
                 multi-core systems. In this paper, we present ReVIVaL,
                 which combines two fine-grained post-fabrication tuning
                 techniques---voltage interpolation (VI) and variable
                 latency (VL). We show that the frequency variation
                 between chips, between cores on one chip, and between
                 functional units within cores can be reduced to a very
                 small range. The effectiveness of these techniques are
                 further verified through experiments on test chips
                 fabricated in a 130nm CMOS process. Detailed
                 architectural simulations of multi-core processors
                 demonstrate significant performance and power
                 advantages are possible by combining variable latency
                 with voltage interpolation.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "chip multiprocessor; microarchitecture; process
                 variations",
}

@Article{Wilkerson:2008:TCC,
  author =       "Chris Wilkerson and Hongliang Gao and Alaa R.
                 Alameldeen and Zeshan Chishti and Muhammad Khellah and
                 Shih-Lien Lu",
  title =        "Trading off Cache Capacity for Reliability to Enable
                 Low Voltage Operation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "203--214",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1394608.1382139",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "One of the most effective techniques to reduce a
                 processor's power consumption is to reduce supply
                 voltage. However, reducing voltage in the context of
                 manufacturing-induced parameter variations can cause
                 many types of memory circuits to fail. As a result,
                 voltage scaling is limited by a minimum voltage, often
                 called Vccmin, beyond which circuits may not operate
                 reliably. Large memory structures (e.g., caches)
                 typically set Vccmin for the whole processor. In this
                 paper, we propose two architectural techniques that
                 enable microprocessor caches (L1 and L2), to operate at
                 low voltages despite very high memory cell failure
                 rates. The Word-disable scheme combines two consecutive
                 cache lines, to form a single cache line where only
                 non-failing words are used. The Bit-fix scheme uses a
                 quarter of the ways in a cache set to store positions
                 and fix bits for failing bits in other ways of the set.
                 During high voltage operation, both schemes allow use
                 of the entire cache. During low voltage operation, they
                 sacrifice cache capacity by 50\% and 25\%,
                 respectively, to reduce Vccmin below 500mV. Compared to
                 current designs with a Vccmin of 825mV, our schemes
                 enable a 40\% voltage reduction, which reduces power by
                 85\% and energy per instruction (EPI) by 53\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "cache; cache design; low power; low voltage;
                 reliability; SRAM; stability; Vccmin",
}

@Article{Roesner:2008:CDP,
  author =       "Franziska Roesner and Doug Burger and Stephen W.
                 Keckler",
  title =        "Counting Dependence Predictors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "215--226",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1394608.1382140",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Modern processors rely on memory dependence prediction
                 to execute load instructions as early as possible,
                 speculating that they are not dependent on an earlier,
                 unissued store. To date, the most sophisticated
                 dependence predictors, such as Store Sets, have been
                 tightly coupled to the fetch and execution streams,
                 requiring global knowledge of the in-flight stream of
                 stores to synchronize loads with specific stores. This
                 paper proposes a new dependence predictor design,
                 called a Counting Dependence Predictor (CDP). The key
                 feature of CDPs is that the prediction mechanism
                 predicts some set of events for which a particular
                 dynamic load should wait, which may include some number
                 of matching stores. By waiting for local events only,
                 this dependence predictor can work effectively in a
                 distributed microarchitecture where centralized fetch
                 and execution streams are infeasible or undesirable. We
                 describe and evaluate a distributed Counting Dependence
                 Predictor and protocol that achieves 92\% of the
                 performance of perfect memory disambiguation. It
                 outperforms a load-wait table, similar to the Alpha
                 21264, by 11\%. Idealized, centralized implementations
                 of Store Sets and the Exclusive Collision Predictor,
                 both of which would be difficult to implement in a
                 distributed microarchitecture, achieve 97\% and 94\% of
                 oracular performance, respectively.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "dependence prediction; memory systems; multiprocessor
                 and multicore architectures",
}

@Article{Jerger:2008:VCT,
  author =       "Natalie Enright Jerger and Li-Shiuan Peh and Mikko
                 Lipasti",
  title =        "Virtual Circuit Tree Multicasting: a Case for On-Chip
                 Hardware Multicast Support",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "229--240",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1394608.1382141",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Current state-of-the-art on-chip networks provide
                 efficiency, high throughput, and low latency for
                 one-to-one (unicast) traffic. The presence of
                 one-to-many (multicast) or one-to-all (broadcast)
                 traffic can significantly degrade the performance of
                 these designs, since they rely on multiple unicasts to
                 provide one-to-many communication. This results in a
                 burst of packets from a single source and is a very
                 inefficient way of performing multicast and broadcast
                 communication. This inefficiency is compounded by the
                 proliferation of architectures and coherence protocols
                 that require multicast and broadcast communication. In
                 this paper, we characterize a wide array of on-chip
                 communication scenarios that benefit from hardware
                 multicast support. We propose Virtual Circuit Tree
                 Multicasting (VCTM) and present a detailed multicast
                 router design that improves network performance by up
                 to 90\% while reducing network activity (hence power)
                 by up to 53\%. Our VCTM router is flexible enough to
                 improve interconnect performance for a broad spectrum
                 of multicasting scenarios, and achieves these benefits
                 with straightforward and inexpensive extensions to a
                 state-of-the-art packet-switched router.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "cache coherence protocol; interconnection network;
                 multiprocessor",
}

@Article{Kodi:2008:IIR,
  author =       "Avinash Karanth Kodi and Ashwini Sarathy and Ahmed
                 Louri",
  title =        "{iDEAL}: Inter-router Dual-Function Energy and
                 Area-Efficient Links for Network-on-Chip {(NoC)}
                 Architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "241--250",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1394608.1382142",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Network-on-Chip (NoC) architectures have been adopted
                 by a growing number of multi-core designs as a flexible
                 and scalable solution to the increasing wire delay
                 constraints in the deep sub-micron regime. However, the
                 shrinking feature size limits the performance of NoCs
                 due to power and area constraints. Research into the
                 optimization of NoCs has shown that a reduction in the
                 number of buffers in the NoC routers reduces the power
                 and area overhead but degrades the network performance.
                 In this paper, we propose iDEAL, a low-power
                 area-efficient NoC architecture by reducing the number
                 of buffers within the router. To overcome the
                 performance degradation caused by the reduced buffer
                 size, we propose to use adaptive dual-function links
                 capable of data transmission as well as data storage
                 when required. Simulation results for the proposed
                 architecture show that reducing the router buffer size
                 in half and using the adaptive dual-function links
                 achieves nearly 40\% savings in buffer power, 30\%
                 savings in overall network power and about 41\% savings
                 in the router area, with only a marginal 1--3\% drop in
                 performance. Moreover, the performance in iDEAL can be
                 further improved by aggressive and speculative flow
                 control techniques.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "interconnects; low-power architecture;
                 network-on-chip",
}

@Article{Park:2008:MML,
  author =       "Dongkook Park and Soumya Eachempati and Reetuparna Das
                 and Asit K. Mishra and Yuan Xie and N. Vijaykrishnan
                 and Chita R. Das",
  title =        "{MIRA}: a Multi-layered On-Chip Interconnect Router
                 Architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "251--261",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1109/ISCA.2008.13",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Recently, Network-on-Chip (NoC) architectures have
                 gained popularity to address the interconnect delay
                 problem for designing CMP / multi-core / SoC systems in
                 deep sub-micron technology. However, almost all prior
                 studies have focused on 2D NoC designs. Since three
                 dimensional (3D) integration has emerged to mitigate
                 the interconnect delay problem, exploring the NoC
                 design space in 3D can provide ample opportunities to
                 design high performance and energy-efficient NoC
                 architectures. In this paper, we propose a 3D stacked
                 NoC router architecture, called MIRA, which unlike the
                 3D routers in previous works, is stacked into multiple
                 layers and optimized to reduce the overall area
                 requirements and power consumption. We discuss the
                 design details of a four-layer 3D NoC and its enhanced
                 version with additional express channels, and compare
                 them against a ($ 6 \times 6 $) 2D design and a baseline
                 3D design. All the designs are evaluated using a
                 cycle-accurate 3D NoC simulator, and integrated with
                 the Orion power model for performance and power
                 analysis. The simulation results with synthetic and
                 application traces demonstrate that the proposed
                 multi-layered NoC routers can outperform the 2D and
                 na{\"\i}ve 3D designs in terms of performance and
                 power. It can achieve up to 42\% reduction in power
                 consumption and up to 51\% improvement in average
                 latency with synthetic workloads. With real workloads,
                 these benefits are around 67\% and 38\%,
                 respectively.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "3D; express channel; express path; Network-on-Chip;
                 NoC; on-chip interconnect; router architecture",
}

@Article{Hower:2008:REE,
  author =       "Derek R. Hower and Mark D. Hill",
  title =        "{Rerun}: Exploiting Episodes for Lightweight Memory
                 Race Recording",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "265--276",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1394608.1382144",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Multiprocessor deterministic replay has many potential
                 uses in the era of multicore computing, including
                 enhanced debugging, fault tolerance, and intrusion
                 detection. While sources of nondeterminism in a
                 uniprocessor can be recorded efficiently in software,
                 it seems likely that hardware support will be needed in
                 a multiprocessor environment where the outcome of
                 memory races must also be recorded. We develop a memory
                 race recording mechanism, called Rerun, that uses small
                 hardware state ($ \approx 166 $ bytes/core), writes a
                 small race log ($ \approx 4 $ bytes/kilo-instruction),
                 and operates well as the number of cores per system
                 scales (e.g., to 16 cores). Rerun exploits the dual of
                 conventional wisdom in race recording: Rather than
                 record information about individual memory accesses
                 that conflict, we record how long a thread executes
                 without conflicting with other threads. In particular,
                 Rerun passively creates atomic episodes. Each episode
                 is a dynamic instruction sequence that a thread happens
                 to execute without interacting with other threads.
                 Rerun uses Lamport Clocks to order episodes and enable
                 replay of an equivalent execution.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "determinism; multicore; race recording",
}

@Article{Lucia:2008:AAD,
  author =       "Brandon Lucia and Joseph Devietti and Karin Strauss
                 and Luis Ceze",
  title =        "{Atom-Aid}: Detecting and Surviving Atomicity
                 Violations",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "277--288",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1394608.1382145",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Writing shared-memory parallel programs is
                 error-prone. Among the concurrency errors that
                 programmers often face are atomicity violations, which
                 are especially challenging. They happen when
                 programmers make incorrect assumptions about atomicity
                 and fail to enclose memory accesses that should occur
                 atomically inside the same critical section. If these
                 accesses happen to be interleaved with conflicting
                 accesses from different threads, the program might
                 behave incorrectly. Recent architectural proposals
                 arbitrarily group consecutive dynamic memory operations
                 into atomic blocks to enforce memory ordering at a
                 coarse grain. This provides what we call implicit
                 atomicity, as the atomic blocks are not derived from
                 explicit program annotations. In this paper, we make
                 the fundamental observation that implicit atomicity
                 probabilistically hides atomicity violations by
                 reducing the number of interleaving opportunities
                 between memory operations. We then propose Atom-Aid,
                 which creates implicit atomic blocks intelligently
                 instead of arbitrarily, dramatically reducing the
                 probability that atomicity violations will manifest
                 themselves. Atom-Aid is also able to report where
                 atomicity violations might exist in the code, providing
                 resilience and debuggability. We evaluate Atom-Aid
                 using buggy code from applications including Apache,
                 MySQL, and XMMS, showing that Atom-Aid virtually
                 eliminates the manifestation of atomicity violations.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "bug; multiprocessors; parallel programming; software
                 reliability",
}

@Article{Montesinos:2008:DRD,
  author =       "Pablo Montesinos and Luis Ceze and Josep Torrellas",
  title =        "{DeLorean}: Recording and Deterministically Replaying
                 Shared-Memory Multiprocessor Execution Efficiently",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "289--300",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1109/ISCA.2008.36",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Support for deterministic replay of multithreaded
                 execution can greatly help in finding concurrency bugs.
                 For highest effectiveness, replay schemes should (i)
                 record at production-run speed, (ii) keep their logging
                 requirements minute, and (iii) replay at a speed
                 similar to that of the initial execution. In this
                 paper, we propose a new substrate for deterministic
                 replay that provides substantial advances along these
                 axes. In our proposal, processors execute blocks of
                 instructions atomically, as in transactional memory or
                 speculative multithreading, and the system only needs
                 to record the commit order of these blocks. We call our
                 scheme DeLorean. Our results show that DeLorean records
                 execution at a speed similar to that of Release
                 Consistency (RC) execution and replays at about 82\% of
                 its speed. In contrast, most current schemes only
                 record at the speed of Sequential Consistency (SC)
                 execution. Moreover, DeLorean only needs 7.5\% of the
                 log size needed by a state-of-the-art scheme. Finally,
                 DeLorean can be configured to need only 0.6\% of the
                 log size of the state-of-the-art scheme at the cost of
                 recording at 86\% of RC's execution speed --- still
                 faster than SC. In this configuration, the log of an
                 8-processor 5-GHz machine is estimated to be only about
                 20GB per day.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sankar:2008:IDP,
  author =       "Sriram Sankar and Sudhanva Gurumurthi and Mircea R.
                 Stan",
  title =        "Intra-disk Parallelism: An Idea Whose Time Has Come",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "303--314",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1394608.1382147",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Server storage systems use a large number of disks to
                 achieve high performance, thereby consuming a
                 significant amount of power. In this paper, we propose
                 to significantly reduce the power consumed by such
                 storage systems via intra-disk parallelism, wherein
                 disk drives can exploit parallelism in the I/O request
                 stream. Intra-disk parallelism can facilitate replacing
                 a large disk array with a smaller one, using the
                 minimum number of disk drives needed to satisfy the
                 capacity requirements. We show that the design space of
                 intra-disk parallelism is large and present a taxonomy
                 to formulate specific implementations within this
                 space. Using a set of commercial workloads, we perform
                 a limit study to identify the key performance
                 bottlenecks that arise when we replace a storage array
                 that is tuned to provide high performance with a single
                 high-capacity disk drive. We show that it is possible
                 to match, and even surpass, the performance of a
                 storage array for these workloads by using a single
                 disk drive of sufficient capacity that exploits
                 intra-disk parallelism, while significantly reducing
                 the power consumed by the storage system. We evaluate
                 the performance and power consumption of disk arrays
                 composed of intra-disk parallel drives, and discuss
                 engineering and cost issues related to the
                 implementation and deployment of such disk drives.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "disk; I/O; parallelism; power; storage",
}

@Article{Lim:2008:UDN,
  author =       "Kevin Lim and Parthasarathy Ranganathan and Jichuan
                 Chang and Chandrakant Patel and Trevor Mudge and Steven
                 Reinhardt",
  title =        "Understanding and Designing New Server Architectures
                 for Emerging Warehouse-Computing Environments",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "315--326",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1394608.1382148",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper seeks to understand and design
                 next-generation servers for emerging
                 `warehouse-computing' environments. We make two key
                 contributions. First, we put together a detailed
                 evaluation infrastructure including a new benchmark
                 suite for warehouse-computing workloads, and detailed
                 performance, cost, and power models, to quantitatively
                 characterize bottlenecks. Second, we study a new
                 solution that incorporates volume non-server-class
                 components in novel packaging solutions, with memory
                 sharing and flash-based disk caching. Our results show
                 that this approach has promise, with a 2X improvement
                 on average in performance-per-dollar for our benchmark
                 suite.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "evaluation; server architecture; warehouse-computing",
}

@Article{Kgil:2008:INF,
  author =       "Taeho Kgil and David Roberts and Trevor Mudge",
  title =        "Improving {NAND} Flash Based Disk Caches",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "327--338",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1109/ISCA.2008.32",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Flash is a widely used storage device that provides
                 high density and low power, appealing properties for
                 general purpose computing. Today, its usual application
                 is in portable special purpose devices such as MP3
                 players. In this paper we examine its use in the server
                 domain --- a more general purpose environment.
                 Aggressive process scaling and the use of multi-level
                 cells continues to improve density ahead of Moore's Law
                 predictions, making Flash even more attractive as a
                 general purpose memory solution. Unfortunately,
                 reliability limits the use of Flash. To seriously
                 consider Flash in the server domain, architectural
                 support must exist to address this concern. This paper
                 first shows how Flash can be used in today's server
                 platforms as a disk cache. It then proposes two
                 improvements. The first improves performance and
                 reliability by splitting Flash based disk caches into
                 separate read and write regions. The second improves
                 reliability by employing a programmable Flash memory
                 controller. It can change the error code strength
                 (number of correctable bits) and the number of bits
                 that a memory cell can store (cell density) according
                 to the demands of the application. Our studies show
                 that Flash reduces overall power consumed by the system
                 memory and hard disk drive up to 3 times while
                 maintaining performance. We also show that Flash
                 lifetime can be improved by a factor of 20 when using a
                 programmable Flash memory controller, if some
                 performance degradation (below 5\%) is acceptable.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "data center; disk cache; Flash; Flash memory
                 controller; NAND Flash",
}

@Article{Li:2008:OEA,
  author =       "Xiaodong Li and Sarita V. Adve and Pradip Bose and
                 Jude A. Rivers",
  title =        "Online Estimation of Architectural Vulnerability
                 Factor for Soft Errors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "341--352",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1394608.1382150",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "As CMOS technology scales and more transistors are
                 packed on to the same chip, soft error reliability has
                 become an increasingly important design issue for
                 processors. Prior research has shown that there is
                 significant architecture-level masking, and many soft
                 error solutions take advantage of this effect. Prior
                 work has also shown that the degree of such masking can
                 vary significantly across workloads and between
                 individual workload phases, motivating dynamic
                 adaptation of reliability solutions for optimal cost
                 and benefit. For such adaptation, it is important to be
                 able to accurately estimate the amount of masking or
                 the architecture vulnerability factor (AVF) online,
                 while the program is running. Unfortunately, existing
                 solutions for estimating AVF are often based on offline
                 simulators and hard to implement in real processors.
                 This paper proposes a novel way of estimating AVF
                 online, using simple modifications to the processor.
                 The estimation method applies to both logic and storage
                 structures on the processor. Compared to previous
                 methods for estimating AVF, our method does not require
                 any offline simulation or calibration for different
                 workloads. We tested our method with a widely used
                 simulator from industry, for four processor structures
                 and for 100 to 200 intervals of each of eleven SPEC
                 benchmarks. The results show that our method provides
                 acceptably accurate AVF estimates at runtime. The
                 absolute error rarely exceeds 0.08 across all
                 application intervals for all structures, and the mean
                 absolute error for a given application and structure
                 combination is always within 0.05.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "AVF estimation; processor reliability; soft error",
}

@Article{Shin:2008:PWR,
  author =       "Jeonghee Shin and Victor Zyuban and Pradip Bose and
                 Timothy M. Pinkston",
  title =        "A Proactive Wearout Recovery Approach for Exploiting
                 Microarchitectural Redundancy to Extend Cache {SRAM}
                 Lifetime",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "353--362",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1394608.1382151",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Microarchitectural redundancy has been proposed as a
                 means of improving chip lifetime reliability. It is
                 typically used in a reactive way, allowing chips to
                 maintain operability in the presence of failures by
                 detecting and isolating, correcting, and/or replacing
                 components on a first-come, first-served basis only
                 after they become faulty. In this paper, we explore an
                 alternative, more preferred method of exploiting
                 microarchitectural redundancy to enhance chip lifetime
                 reliability. In our proposed approach, redundancy is
                 used proactively to allow non-faulty microarchitecture
                 components to be temporarily deactivated, on a rotating
                 basis, to suspend and/or recover from certain wearout
                 effects. This approach improves chip lifetime
                 reliability by warding off the onset of wearout
                 failures as opposed to reacting to them posteriorly.
                 Applied to on-chip cache SRAM for combating
                 NBTI-induced wearout failure, our proactive wearout
                 recovery approach increases lifetime reliability
                 (measured in mean-time-to-failure) of the cache by
                 about a factor of seven relative to no use of
                 microarchitectural redundancy and a factor of five
                 relative to conventional reactive use of redundancy
                 having similar area overhead.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "lifetime reliability; microarchitectural redundancy;
                 proactive approach; wearout recovery",
}

@Article{Teodorescu:2008:VAA,
  author =       "Radu Teodorescu and Josep Torrellas",
  title =        "Variation-Aware Application Scheduling and Power
                 Management for Chip Multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "363--374",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1109/ISCA.2008.40",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Within-die process variation causes individual cores
                 in a Chip Multiprocessor (CMP) to differ substantially
                 in both static power consumed and maximum frequency
                 supported. In this environment, ignoring variation
                 effects when scheduling applications or when managing
                 power with Dynamic Voltage and Frequency Scaling (DVFS)
                 is suboptimal. This paper proposes variation-aware
                 algorithms for application scheduling and power
                 management. One such power management algorithm, called
                 {\em LinOpt}, uses linear programming to find the best
                 voltage and frequency levels for each of the cores in
                 the CMP --- maximizing throughput at a given power
                 budget. In a 20-core CMP, the combination of
                 variation-aware application scheduling and {\em
                 LinOpt\/} increases the average throughput by 12--17\%
                 and reduces the average $ E D^2 $ by 30--38\% --- all
                 relative to using variation-aware scheduling together
                 with a simple extension to Intel's Foxton power
                 management algorithm.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "application scheduling; power management; process
                 variation",
}

@Article{Chen:2008:FHA,
  author =       "Shimin Chen and Michael Kozuch and Theodoros Strigkos
                 and Babak Falsafi and Phillip B. Gibbons and Todd C.
                 Mowry and Vijaya Ramachandran and Olatunji Ruwase and
                 Michael Ryan and Evangelos Vlachos",
  title =        "Flexible Hardware Acceleration for Instruction-Grain
                 Program Monitoring",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "377--388",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1394608.1382153",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Instruction-grain program monitoring tools, which
                 check and analyze executing programs at the granularity
                 of individual instructions, are invaluable for quickly
                 detecting bugs and security attacks and then limiting
                 their damage (via containment and/or recovery).
                 Unfortunately, their fine-grain nature implies very
                 high monitoring overheads for software-only tools,
                 which are typically based on dynamic binary
                 instrumentation. Previous hardware proposals either
                 focus on mechanisms that target specific bugs or
                 address only the cost of binary instrumentation. In
                 this paper, we propose a flexible hardware solution for
                 accelerating a wide range of instruction-grain
                 monitoring tools. By examining a number of diverse
                 tools (for memory checking, security tracking, and data
                 race detection), we identify three significant common
                 sources of overheads and then propose three novel
                 hardware techniques for addressing these overheads:
                 Inheritance Tracking, Idempotent Filters, and
                 Metadata-TLBs. Together, these constitute a
                 general-purpose hardware acceleration framework.
                 Experimental results show our framework reduces
                 overheads by 2--3X over the previous state-of-the-art,
                 while supporting the needed flexibility.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "hardware acceleration; idempotent filter; inheritance
                 tracking; instruction-grain program monitoring; LBA;
                 lifeguards; log-based architectures; metadata-TLB",
}

@Article{Clark:2008:VVE,
  author =       "Nathan Clark and Amir Hormati and Scott Mahlke",
  title =        "{VEAL}: Virtualized Execution Accelerator for Loops",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "389--400",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1109/ISCA.2008.33",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Performance improvement solely through transistor
                 scaling is becoming more and more difficult, thus it is
                 increasingly common to see domain specific accelerators
                 used in conjunction with general purpose processors to
                 achieve future performance goals. There is a serious
                 drawback to accelerators, though: binary compatibility.
                 An application compiled to utilize an accelerator
                 cannot run on a processor without that accelerator, and
                 applications that do not utilize an accelerator will
                 never use it. To overcome this problem, we propose
                 decoupling the instruction set architecture from the
                 underlying accelerators. Computation to be accelerated
                 is expressed using a processor's baseline instruction
                 set, and light-weight dynamic translation maps the
                 representation to whatever accelerators are available
                 in the system. In this paper, we describe the changes
                 to a compilation framework and processor system needed
                 to support this abstraction for an important set of
                 accelerator designs that support innermost loops. In
                 this analysis, we investigate the dynamic overheads
                 associated with abstraction as well as the
                 static/dynamic tradeoffs to improve the dynamic mapping
                 of loop-nests. As part of the exploration, we also
                 provide a quantitative analysis of the hardware
                 characteristics of effective loop accelerators. We
                 conclude that using a hybrid static-dynamic compilation
                 approach to map computation on to loop-level
                 accelerators is a practical way to increase computation
                 efficiency, without the overheads associated with
                 instruction set modification.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chen:2008:SSP,
  author =       "Haibo Chen and Xi Wu and Liwei Yuan and Binyu Zang and
                 Pen-chung Yew and Frederic T. Chong",
  title =        "From Speculation to Security: Practical and Efficient
                 Information Flow Tracking Using Speculative Hardware",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "401--412",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1394608.1382156",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Dynamic information flow tracking (also known as taint
                 tracking) is an appealing approach to combat various
                 security attacks. However, the performance of
                 applications can severely degrade without hardware
                 support for tracking taints. This paper observes that
                 information flow tracking can be efficiently emulated
                 using deferred exception tracking in microprocessors
                 supporting speculative execution. Based on this
                 observation, we propose SHIFT, a low-overhead,
                 software-based dynamic information flow tracking system
                 to detect a wide range of attacks. The key idea is to
                 treat tainted state (describing untrusted data) as
                 speculative state (describing deferred exceptions).
                 SHIFT leverages existing architectural support for
                 speculative execution to track tainted state in
                 registers and needs to instrument only load and store
                 instructions to track tainted state in memory using a
                 bitmap, which results in significant performance
                 advantages. Moreover, by decoupling mechanisms for
                 taint tracking from security policies, SHIFT can detect
                 a wide range of exploits, including high-level semantic
                 attacks. We have implemented SHIFT using the Itanium
                 processor, which has support for deferred exceptions,
                 and by modifying GCC to instrument loads and stores. A
                 security assessment shows that SHIFT can detect both
                 low-level memory corruption exploits as well as
                 high-level semantic attacks with no false positives.
                 Performance measurements show that SHIFT incurs about
                 1\% overhead for server applications. The performance
                 slowdown for SPEC-INT2000 is 2.81X and 2.27X for
                 tracking at byte-level and word-level respectively.
                 Minor architectural improvements to the Itanium
                 processor (adding three simple instructions) can reduce
                 the performance slowdown down to 2.32X and 1.8X for
                 byte-level and word-level tracking, respectively.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "deferred exception; dynamic information flow tracking;
                 speculative execution; taint tracking",
}

@Article{Boneti:2008:SCP,
  author =       "Carlos Boneti and Francisco J. Cazorla and Roberto
                 Gioiosa and Alper Buyuktosunoglu and Chen-Yong Cher and
                 Mateo Valero",
  title =        "Software-Controlled Priority Characterization of
                 {POWER5} Processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "415--426",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1109/ISCA.2008.8",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Due to the limitations of instruction-level
                 parallelism, thread-level parallelism has become a
                 popular way to improve processor performance. One
                 example is the IBM POWER5\texttrademark{} processor, a
                 two-context simultaneous-multithreaded dual-core chip.
                 In each SMT
                 core, the IBM POWER5 features two levels of thread
                 resource balancing and prioritization. The first level
                 provides automatic in-hardware resource balancing,
                 while the second level is a software-controlled
                 priority mechanism that presents eight levels of thread
                 priorities. Currently, software-controlled
                 prioritization is only used in limited number of cases
                 in the software platforms due to lack of performance
                 characterization of the effects of this mechanism. In
                 this work, we characterize the effects of the
                 software-based prioritization on several different
                 workloads. We show that the impact of the
                 prioritization significantly depends on the workloads
                 coscheduled on a core. By prioritizing the right task,
                 it is possible to obtain more than two times of
                 throughput improvement for synthetic workloads compared
                 to the baseline. We also present two application case
                 studies targeting two different performance metrics:
                 the first case study improves overall throughput by
                 23.7\% and the second case study reduces the total
                 execution time by 9.3\%. In addition, we show the
                 circumstances when a background thread can be run
                 transparently without affecting the performance of the
                 foreground thread.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "IBM POWER5; performance characterization; simultaneous
                 multithreading; SMT; software-controlled
                 prioritization",
}

@Article{Shye:2008:LLR,
  author =       "Alex Shye and Berkin Ozisikyilmaz and Arindam Mallik
                 and Gokhan Memik and Peter A. Dinda and Robert P. Dick
                 and Alok N. Choudhary",
  title =        "Learning and Leveraging the Relationship between
                 Architecture-Level Measurements and Individual User
                 Satisfaction",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "427--438",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1394608.1382158",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The ultimate goal of computer design is to satisfy the
                 end-user. In particular computing domains, such as
                 interactive applications, there exists a variation in
                 user expectations and user satisfaction relative to the
                 performance of existing computer systems. In this work,
                 we leverage this variation to develop more efficient
                 architectures that are customized to end-users. We
                 first investigate the relationship between
                 microarchitectural parameters and user satisfaction.
                 Specifically, we analyze the relationship between
                 hardware performance counter (HPC) readings and
                 individual satisfaction levels reported by users for
                 representative applications. Our results show that the
                 satisfaction of the user is strongly correlated to the
                 performance of the underlying hardware. More
                 importantly, the results show that user satisfaction is
                 highly user-dependent. To take advantage of these
                 observations, we develop a framework called
                 Individualized Dynamic Voltage and Frequency Scaling
                 (iDVFS). We study a group of users to characterize the
                 relationship between the HPCs and individual user
                 satisfaction levels. Based on this analysis, we use
                 artificial neural networks to model the function from
                 HPCs to user satisfaction for individual users. This
                 model is then used online to predict user satisfaction
                 and set the frequency level accordingly. A second set
                 of user studies demonstrates that iDVFS reduces the CPU
                 power consumption by over 25\% in representative
                 applications as compared to the Windows XP DVFS
                 algorithm.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "dynamic power management; hardware performance
                 counters; learning user satisfaction; user-aware
                 architectures",
}

@Article{Kumar:2008:AVO,
  author =       "Sanjeev Kumar and Daehyun Kim and Mikhail Smelyanskiy
                 and Yen-Kuang Chen and Jatin Chhugani and Christopher
                 J. Hughes and Changkyu Kim and Victor W. Lee and
                 Anthony D. Nguyen",
  title =        "Atomic Vector Operations on Chip Multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "441--452",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1394608.1382154",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The current trend is for processors to deliver
                 dramatic improvements in parallel performance while
                 only modestly improving serial performance. Parallel
                 performance is harvested through vector/SIMD
                 instructions as well as multithreading (through both
                 multithreaded cores and chip multiprocessors). Vector
                 parallelism can be more efficiently supported than
                 multithreading, but is often harder for software to
                 exploit. In particular, code with sparse data access
                 patterns cannot easily utilize the vector/SIMD
                 instructions of mainstream processors. Hardware to
                 scatter and gather sparse data has previously been
                 proposed to enable vector execution for these codes.
                 However, on multithreaded architectures, a number of
                 applications spend significant time on atomic
                 operations (e.g., parallel reductions), which cannot be
                 vectorized using previously proposed schemes. This
                 paper proposes architectural support for atomic vector
                 operations (referred to as GLSC) that addresses this
                 limitation. GLSC extends scatter-gather hardware to
                 support atomic memory operations. Our experiments show
                 that the GLSC provides an average performance
                 improvement on a set of important RMS kernels of 54\%
                 for 4-wide SIMD.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "locks; multiprocessors; reductions; SIMD; vector",
}

@Article{Loh:2008:SMA,
  author =       "Gabriel H. Loh",
  title =        "{$3$D}-Stacked Memory Architectures for Multi-core
                 Processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "453--464",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1394608.1382159",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Three-dimensional integration enables stacking memory
                 directly on top of a microprocessor, thereby
                 significantly reducing wire delay between the two.
                 Previous studies have examined the performance benefits
                 of such an approach, but all of these works only
                 consider commodity 2D DRAM organizations. In this work,
                 we explore more aggressive 3D DRAM organizations that
                 make better use of the additional die-to-die bandwidth
                 provided by 3D stacking, as well as the additional
                 transistor count. Our simulation results show that with
                 a few simple changes to the 3D-DRAM organization, we
                 can achieve a 1.75x speedup over previously proposed
                 3D-DRAM approaches on our memory-intensive
                 multi-programmed workloads on a quad-core processor.
                 The significant increase in memory system performance
                 makes the L2 miss handling architecture (MHA) a new
                 bottleneck, which we address by combining a novel data
                 structure called the Vector Bloom Filter with dynamic
                 MSHR capacity tuning. Our scalable L2 MHA yields an
                 additional 17.8\% performance improvement over our
                 3D-stacked memory architecture.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "3D integration; memory; multi-core",
}

@Article{Anonymous:2008:AI,
  author =       "Anonymous",
  title =        "Author Index",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "465--466",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1394608.1382160",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Anonymous:2008:PI,
  author =       "Anonymous",
  title =        "{Publisher}'s Information",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "468--468",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1394608.1382161",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Anonymous:2008:CA,
  author =       "Anonymous",
  title =        "Cover Art",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "3",
  pages =        "C1--C1",
  month =        jun,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1394608.1382162",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Aug 6 08:35:03 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Karne:2008:OSC,
  author =       "Ramesh K. Karne and Alexander L. Wijesinha and George
                 H. {Ford, Jr.}",
  title =        "Opinion: stay on course with an evolution or choose a
                 revolution in computing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "4",
  pages =        "1--6",
  month =        sep,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1462609.1462611",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Dec 8 14:01:02 MST 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:2008:INa,
  author =       "Mark Thorson",
  title =        "{Internet} Nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "4",
  pages =        "7--11",
  month =        sep,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1462609.1462613",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Dec 8 14:01:02 MST 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bengtsson:2008:DSA,
  author =       "Jerker Bengtsson and Bertil Svensson",
  title =        "A domain-specific approach for software development on
                 {Manycore} platforms",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "5",
  pages =        "2--10",
  month =        dec,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1556444.1556446",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 26 11:50:56 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The programming complexity of increasingly parallel
                 processors calls for new tools that assist programmers
                 in utilising the parallel hardware resources. In this
                 paper we present a set of models that we have developed
                 as part of a tool for mapping dataflow graphs onto
                 manycores. One of the models captures the essentials of
                 manycores identified as suitable for signal processing,
                 and which we use as target for our algorithms. As an
                 intermediate representation we introduce timed
                 configuration graphs, which describe the mapping of a
                 model of an application onto a machine model. Moreover,
                 we show how a timed configuration graph by very simple
                 means can be evaluated using an abstract interpretation
                 to obtain performance feedback. This information can be
                 used by our tool and by the programmer in order to
                 discover improved mappings.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Cederman:2008:SLB,
  author =       "Daniel Cederman and Philippas Tsigas",
  title =        "On sorting and load balancing on {GPUs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "5",
  pages =        "11--18",
  month =        dec,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1556444.1556447",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 26 11:50:56 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In this paper we take a look at GPU-Quicksort, an
                 efficient Quicksort algorithm suitable for the highly
                 parallel multi-core graphics processors. Quicksort had
                 previously been considered an inefficient sorting
                 solution for graphics processors, but GPU-Quicksort
                 often performs better than the fastest known sorting
                 implementations for graphics processors, such as radix
                 and bitonic sort. Quicksort can thus be seen as a
                 viable alternative for sorting large quantities of data
                 on graphics processors.\par

                 We also take a look at a comparison of different load
                 balancing schemes. To get maximum performance on the
                 many-core graphics processors it is important to have
                 an even balance of the workload so that all processing
                 units contribute equally to the task at hand. This can
                 be hard to achieve when the cost of a task is not known
                 beforehand and when new sub-tasks are created
                 dynamically during execution. With the recent advent of
                 scatter operations and atomic hardware primitives it is
                 now possible to bring some of the more elaborate
                 dynamic load balancing schemes from the conventional
                 SMP systems domain to the graphics processor domain.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ha:2008:NBP,
  author =       "Phuong Hoai Ha and Philippas Tsigas and Otto J.
                 Anshus",
  title =        "Non-blocking programming on multi-core graphics
                 processors: (extended abstract)",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "5",
  pages =        "19--28",
  month =        dec,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1556444.1556448",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 26 11:50:56 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper investigates the synchronization power of
                 coalesced memory accesses, a family of memory access
                 mechanisms introduced in recent large multicore
                 architectures like the CUDA graphics processors. We
                 first design three memory access models to capture the
                 fundamental features of the new memory access
                 mechanisms. Subsequently, we prove the exact
                 synchronization power of these models in terms of their
                 consensus numbers. These tight results show that the
                 coalesced memory access mechanisms can facilitate
                 strong synchronization between the threads of multicore
                 processors, without the need of synchronization
                 primitives other than reads and writes.\par

                 Moreover, based on the intrinsic features of recent GPU
                 architectures, we construct strong synchronization
                 objects like wait-free and t-resilient
                 read-modify-write objects for a general model of recent
                 GPU architectures without strong hardware
                 synchronization primitives like test-and-set and
                 compare-and-swap. Accesses to the wait-free objects
                 have time complexity $ O(N) $, where $N$ is the number
                 of processes. Our result demonstrates that it is
                 possible to construct wait-free synchronization
                 mechanisms for GPUs without the need of strong
                 synchronization primitives in hardware and that
                 wait-free programming is possible for GPUs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bhattacharyya:2008:ODT,
  author =       "Shuvra S. Bhattacharyya and Gordon Brebner and
                 J{\"o}rn W. Janneck and Johan Eker and Carl von Platen
                 and Marco Mattavelli and Micka{\"e}l Raulet",
  title =        "{OpenDF}: a dataflow toolset for reconfigurable
                 hardware and multicore systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "5",
  pages =        "29--35",
  month =        dec,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1556444.1556449",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 26 11:50:56 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper presents the OpenDF framework and recalls
                 that dataflow programming was once invented to address
                 the problem of parallel computing. We discuss the
                 problems with an imperative style, von Neumann
                 programs, and present what we believe are the
                 advantages of using a dataflow programming model. The
                 CAL actor language is briefly presented and its role in
                 the ISO/MPEG standard is discussed. The Dataflow
                 Interchange Format (DIF) and related tools can be used
                 for analysis of actors and networks, demonstrating the
                 advantages of a dataflow approach. Finally, an overview
                 of a case study implementing an MPEG-4 decoder is
                 given.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kessler:2008:OCP,
  author =       "Christoph W. Kessler and J{\"o}rg Keller",
  title =        "Optimized on-chip pipelining of memory-intensive
                 computations on the {Cell} {BE}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "5",
  pages =        "36--45",
  month =        dec,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1556444.1556450",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 26 11:50:56 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Multiprocessors-on-chip, such as the Cell BE
                 processor, regularly suffer from restricted bandwidth
                 to off-chip main memory. We propose to reduce memory
                 bandwidth requirements, and thus increase performance,
                 by expressing our application as a task graph, by
                 running dependent tasks concurrently and by pipelining
                 results directly from task to task where possible,
                 instead of buffering in off-chip memory. To maximize
                 bandwidth savings and balance load simultaneously, we
                 solve a mapping problem of tasks to SPEs on the Cell
                 BE. We present three approaches: an integer linear
                 programming formulation that allows to compute
                 Pareto-optimal mappings for smaller task graphs, general
                 heuristics, and a problem specific approximation
                 algorithm. We validate the mappings for data-parallel
                 computations and sorting.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lundvall:2008:APS,
  author =       "H{\aa}kan Lundvall and Kristian Stav{\aa}ker and Peter
                 Fritzson and Christoph Kessler",
  title =        "Automatic parallelization of simulation code for
                 equation-based models with software pipelining and
                 measurements on three platforms",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "5",
  pages =        "46--55",
  month =        dec,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1556444.1556451",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 26 11:50:56 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In this work we report results from a new integrated
                 method of automatically generating parallel code from
                 Modelica models by combining parallelization at two
                 levels of abstraction. Performing inline expansion of a
                 Runge--Kutta solver combined with fine-grained
                 automatic parallelization of the right-hand side of the
                 resulting equation system opens up new possibilities
                 for generating high performance code, which is becoming
                 increasingly relevant when multi-core computers are
                 becoming commonplace. An implementation, in the form of
                 a backend module for the OpenModelica compiler, has
                 been developed and used for measurements on two
                 architectures: Intel Xeon and SGI Altix 3700 Bx2. This
                 paper also contains some very recent results of a
                 prototype implementation of this parallelization
                 approach on the Cell BE processor architecture.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Fang:2008:SDA,
  author =       "Huan Fang and Mats Brorsson",
  title =        "Scalable directory architecture for distributed shared
                 memory chip multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "5",
  pages =        "56--64",
  month =        dec,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1556444.1556452",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 26 11:50:56 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Traditional Directory-based cache coherence protocol
                 is far from optimal for large-scale cache coherent
                 shared memory multiprocessors due to the increasing
                 latency to access directories stored in DRAM memory.
                 Instead of keeping directories in main memory, we
                 consider distributing the directory together with L2
                 cache across all nodes on a Chip Multiprocessor. Each
                 node contains a processing unit, a private L1 cache, a
                 slice of L2 cache, memory controller and a router. Both
                 L2 cache and memories are distributed shared and
                 interleaved by a subset of memory address bits. All
                 nodes are interconnected through a low latency two
                 dimensional Mesh network. Directory, being a split
                 component to L2 cache, only stores sharing information
                 for blocks while L2 cache stores only data blocks
                 exclusive with L1 cache. Shared L2 cache can increase
                 total effective cache capacity on chip, but also
                 increase the miss latency when data is on a remote
                 node. Being different from Directory Cache structure,
                 our proposal totally removes the directory from memory,
                 which saves memory space and reduces access latency.
                 Compared to L2 cache that combines directory
                 information internally, our L2 cache structure saves up
                 to 88\% cache space and achieves similar performance.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Jonsson:2008:SSE,
  author =       "Bengt Jonsson",
  title =        "State-space exploration for concurrent algorithms
                 under weak memory orderings: (preliminary version)",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "5",
  pages =        "65--71",
  month =        dec,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1556444.1556453",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 26 11:50:56 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Several concurrent implementations of familiar data
                 abstractions such as queues, sets, or maps typically do
                 not follow locking disciplines, and often use lock-free
                 synchronization to gain performance. Since such
                 algorithms are exposed to a weak memory model, they are
                 notoriously hard to get correct, as witnessed by many
                 bugs found in published algorithms. We outline a
                 technique for analyzing correctness of concurrent
                 algorithms under weak memory models, in which a model
                 checker is used to search for correctness violations.
                 The algorithm to be analyzed is transformed into a form
                 where statements may be reordered according to a
                 particular weak memory ordering. The transformed
                 algorithm can then be analyzed by a model-checking
                 tool, e.g., by enumerative state exploration. We
                 illustrate the approach on a small example of a queue,
                 which allows an enqueue operation to be concurrent with
                 a dequeue operation, which we analyze with respect to
                 the RMO memory model defined in SPARC v9.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Abdulla:2008:MCR,
  author =       "Parosh Aziz Abdulla and Fr{\'e}d{\'e}ric Haziza and
                 Mats Kindahl",
  title =        "Model checking race-freeness",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "5",
  pages =        "72--79",
  month =        dec,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1556444.1556454",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 26 11:50:56 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "With the introduction of highly concurrent systems in
                 standard desktop computers, ensuring correctness of
                 industrial-size concurrent programs is becoming
                 increasingly important. One of the most important
                 standards in use for developing multi-threaded programs
                 is the POSIX Threads standard, commonly known as
                 PThreads. Of particular importance, the analysis of
                 industrial code should, as far as possible, be
                 automatic and not require annotations or other forms of
                 specifications of the code.\par

                 Model checking has been one of the most successful
                 approaches to program verification during the last two
                 decades. The size and complexity of applications which
                 can be handled have increased rapidly through
                 integration with symbolic techniques. These methods are
                 designed to work on finite (but large) state spaces.
                 This framework fails to deal with several essential
                 aspects of behaviours for multithreaded programs: there
                 is no bound a priori on the number of threads which may
                 arise in a given run of the system; each thread
                 manipulates local variables which often range over
                 unbounded domains; and the system has a dynamic
                 structure in the sense that threads can be created and
                 killed throughout execution of the system. In this
                 paper we concentrate on checking a particular class of
                 properties for concurrent programs, namely safety
                 properties. In particular, we focus on race-freeness,
                 that is, the absence of race conditions (also known as
                 data races) in shared-variable pthreaded
                 programs.\par

                 We will follow a particular methodology which we have
                 earlier developed for model checking general classes of
                 infinite-state systems [1, 3, 6, 8, 9] and apply a
                 symbolic backward reachability analysis to verify the
                 safety property. Since we construct a model as an
                 over-approximation of the original program, proving the
                 safety property in the model implies that the property
                 also holds in the original system. Surprisingly, it
                 leads to a quite efficient analysis which can be
                 carried out fully automatically.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sundell:2008:NNB,
  author =       "H{\aa}kan Sundell and Philippas Tsigas",
  title =        "{NOBLE}: non-blocking programming support via
                 lock-free shared abstract data types",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "5",
  pages =        "80--87",
  month =        dec,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1556444.1556455",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 26 11:50:56 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "An essential part of programming for multi-core and
                 multi-processor includes efficient and reliable means
                 for sharing data. Lock-free data structures are known
                 as very suitable for this purpose, although experienced
                 to be very complex to design. In this paper, we present
                 a software library of non-blocking abstract data types
                 that have been designed to facilitate lock-free
                 programming for non-experts. The system provides: (i)
                 efficient implementations of the most commonly used
                 data types in concurrent and sequential software
                 design, (ii) a lock-free memory management system, and
                 (iii) a run-time system. The library provides clear
                 semantics that are at least as strong as those of
                 corresponding lock-based implementations of the
                 respective data types. Our software library can be used
                 for facilitating lock-free programming; its design
                 enables the programmer to: (i) replace lock-based
                 components of sequential or parallel code easily and
                 efficiently, (ii) use well-tuned concurrent algorithms
                 inside a software or hardware transactional system. In
                 the paper we describe the design and functionality of
                 the system. We also provide experimental results that
                 show that the library can considerably improve the
                 performance of software systems.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gidenstam:2008:LLF,
  author =       "Anders Gidenstam and Marina Papatriantafilou",
  title =        "{LFTHREADS}: a lock-free thread library",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "5",
  pages =        "88--92",
  month =        dec,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1556444.1556456",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 26 11:50:56 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This extended abstract presents LFTHREADS, a thread
                 library entirely based on lock-free methods, i.e. no
                 spinlocks or similar synchronization mechanisms are
                 employed in the implementation of the multithreading.
                 Since lock-freedom is highly desirable in
                 multiprocessors/multicores due to its advantages in
                 parallelism, fault-tolerance, convoy-avoidance and
                 more, there is an increased demand in lock-free methods
                 in parallel applications, hence also in
                 multiprocessor/multicore system services. LFTHREADS is
                 the first thread library that provides a lock-free
                 implementation of blocking synchronization primitives
                 for application threads; although the latter may sound
                 like a contradicting goal, such objects have several
                 benefits: e.g. library operations that block and
                 unblock threads on the same synchronization object can
                 make progress in parallel while maintaining the desired
                 thread-level semantics and without having to wait for
                 any ``slow'' operations among them. Besides, as no
                 spin-locks or similar synchronization mechanisms are
                 employed, memory contention can be reduced and
                 processors/cores are able to do useful work. As a
                 consequence, applications, too, can enjoy enhanced
                 parallelism and fault-tolerance. For the
                 synchronization in LFTHREADS we have introduced a new
                 method, which we call responsibility hand-off (RHO),
                 that does not need any special kernel support. The RHO
                 method is also of independent interest, as it can also
                 serve as a tool for lock-free token passing, management
                 of contention and interaction between scheduling and
                 synchronization. This paper gives an outline and the
                 context of LFTHREADS. For more details the reader is
                 referred to [7] and [8].",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Faxen:2008:WWS,
  author =       "Karl-Filip Fax{\'e}n",
  title =        "{Wool} --- a work stealing library",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "5",
  pages =        "93--100",
  month =        dec,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1556444.1556457",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 26 11:50:56 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper presents some preliminary results on a
                 small light weight user level task management library
                 called Wool. The Wool task scheduler is based on work
                 stealing. The objective of the library is to provide a
                 reasonably convenient programming interface (in
                 particular by not forcing the programmer to write in
                 continuation passing style) in ordinary C while still
                 having a very low task creation overhead. Several task
                 scheduling systems based on work stealing exists, but
                 they are typically either programming languages like
                 Cilk-5 or based on C++ like the Intel TBB or C\# as in
                 the Microsoft TPL. Our main conclusions are that such a
                 direct style interface is indeed possible and yields
                 performance that is comparable to that of the Intel
                 TBB.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:2008:INb,
  author =       "Mark Thorson",
  title =        "{Internet} nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "36",
  number =       "5",
  pages =        "101--111",
  month =        dec,
  year =         "2008",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1556444.1556459",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 26 11:50:56 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% TO DO: [26-Jun-2009] Volume 36 number 6: no data yet in ACM Portal database

@article{Gebhart:2009:ETC,
  author          = {Mark Gebhart and Bertrand A. Maher and Katherine E.
    Coons and Jeff Diamond and Paul Gratz and Mario Marino
    and Nitya Ranganathan and Behnam Robatmili and Aaron
    Smith and James Burrill and Stephen W. Keckler and Doug
    Burger and Kathryn S. McKinley},
  title           = {An evaluation of the {TRIPS} computer system},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {37},
  number          = {1},
  pages           = {1--12},
  month           = mar,
  year            = {2009},
  coden           = {CANED2},
  doi             = {https://doi.org/10.1145/2528521.1508246},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  bibdate         = {Thu Sep 4 07:47:19 MDT 2014},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {The TRIPS system employs a new instruction set
    architecture (ISA) called Explicit Data Graph Execution
    (EDGE) that renegotiates the boundary between hardware
    and software to expose and exploit concurrency. EDGE
    ISAs use a block-atomic execution model in which blocks
    are composed of dataflow instructions. The goal of the
    TRIPS design is to mine concurrency for high
    performance while tolerating emerging technology
    scaling challenges, such as increasing wire delays and
    power consumption. This paper evaluates how well TRIPS
    meets this goal through a detailed ISA and performance
    analysis. We compare performance, using cycles counts,
    to commercial processors. On SPEC CPU2000, the Intel
    Core 2 outperforms compiled TRIPS code in most cases,
    although TRIPS matches a Pentium 4. On simple
    benchmarks, compiled TRIPS code outperforms the Core 2
    by 10\% and hand-optimized TRIPS code outperforms it by
    factor of 3. Compared to conventional ISAs, the
    block-atomic model provides a larger instruction
    window, increases concurrency at a cost of more
    instructions executed, and replaces register and memory
    accesses with more efficient direct
    instruction-to-instruction communication. Our analysis
    suggests ISA, microarchitecture, and compiler
    enhancements for addressing weaknesses in TRIPS and
    indicates that EDGE architectures have the potential to
    exploit greater concurrency in future technologies.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  remark          = {ASPLOS 2009 conference proceedings.},
}

@article{Pistol:2009:AIN,
  author          = {Constantin Pistol and Wutichai Chongchitmate and
    Christopher Dwyer and Alvin R. Lebeck},
  title           = {Architectural implications of nanoscale integrated
    sensing and computing},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {37},
  number          = {1},
  pages           = {13--24},
  month           = mar,
  year            = {2009},
  coden           = {CANED2},
  doi             = {https://doi.org/10.1145/2528521.1508247},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  bibdate         = {Thu Sep 4 07:47:19 MDT 2014},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {This paper explores the architectural implications of
    integrating computation and molecular probes to form
    nanoscale sensor processors (nSP). We show how nSPs may
    enable new computing domains and automate tasks that
    currently require expert scientific training and costly
    equipment. This new application domain severely
    constrains nSP size, which significantly impacts the
    architectural design space. In this context, we explore
    nSP architectures and present an nSP design that
    includes a simple accumulator-based ISA, sensors,
    limited memory and communication transceivers. To
    reduce the application memory footprint, we introduce
    the concept of instruction-fused sensing. We use
    simulation and analytical models to evaluate nSP
    designs executing a representative set of target
    applications. Furthermore, we propose a candidate nSP
    technology based on optical Resonance Energy Transfer
    (RET) logic that enables the small size required by the
    application domain; our smallest design is about the
    size of the largest known virus. We also show
    laboratory results that demonstrate initial steps
    towards a prototype.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  remark          = {ASPLOS 2009 conference proceedings.},
}

@Article{Park:2009:CEA,
  author =       "Soyeon Park and Shan Lu and Yuanyuan Zhou",
  title =        "{CTrigger}: exposing atomicity violation bugs from
                 their hiding places",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "1",
  pages =        "25--36",
  month =        mar,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2528521.1508249",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:47:19 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Multicore hardware is making concurrent programs
                 pervasive. Unfortunately, concurrent programs are prone
                 to bugs. Among different types of concurrency bugs,
                 atomicity violation bugs are common and important.
                 Existing techniques to detect atomicity violation bugs
                 suffer from one limitation: requiring bugs to manifest
                 during monitored runs, which is an open problem in
                 concurrent program testing. This paper makes two
                 contributions. First, it studies the interleaving
                 characteristics of the common practice in concurrent
                 program testing (i.e., running a program over and over)
                 to understand why atomicity violation bugs are hard to
                 expose. Second, it proposes CTrigger to effectively and
                 efficiently expose atomicity violation bugs in large
                 programs. CTrigger focuses on a special type of
                 interleavings (i.e., unserializable interleavings) that
                 are inherently correlated to atomicity violation bugs,
                 and uses trace analysis to systematically identify
                 (likely) feasible unserializable interleavings with low
                 occurrence-probability. CTrigger then uses minimum
                 execution perturbation to exercise low-probability
                 interleavings and expose difficult-to-catch atomicity
                 violation. We evaluate CTrigger with real-world
                 atomicity violation bugs from four server/desktop
                 applications (Apache, MySQL, Mozilla, and PBZIP2) and
                 three SPLASH2 applications on 8-core machines. CTrigger
                 efficiently exposes the tested bugs within 1--235
                 seconds, two to four orders of magnitude faster than
                 stress testing. Without CTrigger, some of these bugs do
                 not manifest even after 7 full days of stress testing.
                 In addition, without deterministic replay support, once
                 a bug is exposed, CTrigger can help programmers
                 reliably reproduce it for diagnosis. Our tested bugs
                 are reproduced by CTrigger mostly within 5 seconds, 300
                 to over 60000 times faster than stress testing.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS 2009 conference proceedings.",
}

@article{Sidiroglou:2009:AAS,
  author          = {Stelios Sidiroglou and Oren Laadan and Carlos Perez
    and Nicolas Viennot and Jason Nieh and Angelos D.
    Keromytis},
  title           = {{ASSURE}: automatic software self-healing using rescue
    points},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {37},
  number          = {1},
  pages           = {37--48},
  month           = mar,
  year            = {2009},
  coden           = {CANED2},
  doi             = {https://doi.org/10.1145/2528521.1508250},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  bibdate         = {Thu Sep 4 07:47:19 MDT 2014},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Software failures in server applications are a
    significant problem for preserving system availability.
    We present ASSURE, a system that introduces rescue
    points that recover software from unknown faults while
    maintaining both system integrity and availability, by
    mimicking system behavior under known error conditions.
    Rescue points are locations in existing application
    code for handling a given set of programmer-anticipated
    failures, which are automatically repurposed and tested
    for safely enabling fault recovery from a larger class
    of (unanticipated) faults. When a fault occurs at an
    arbitrary location in the program, ASSURE restores
    execution to an appropriate rescue point and induces
    the program to recover execution by virtualizing the
    program's existing error-handling facilities. Rescue
    points are identified using fuzzing, implemented using
    a fast coordinated checkpoint-restart mechanism that
    handles multi-process and multi-threaded applications,
    and, after testing, are injected into production code
    using binary patching. We have implemented an ASSURE
    Linux prototype that operates without application
    source code and without base operating system kernel
    changes. Our experimental results on a set of
    real-world server applications and bugs show that
    ASSURE enabled recovery for all of the bugs tested with
    fast recovery times, has modest performance overhead,
    and provides automatic self-healing orders of magnitude
    faster than current human-driven patch deployment
    methods.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  remark          = {ASPLOS 2009 conference proceedings.},
}

@article{Lenharth:2009:RDO,
  author          = {Andrew Lenharth and Vikram S. Adve and Samuel T.
    King},
  title           = {Recovery domains: an organizing principle for
    recoverable operating systems},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {37},
  number          = {1},
  pages           = {49--60},
  month           = mar,
  year            = {2009},
  coden           = {CANED2},
  doi             = {https://doi.org/10.1145/2528521.1508251},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  bibdate         = {Thu Sep 4 07:47:19 MDT 2014},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {We describe a strategy for enabling existing commodity
    operating systems to recover from unexpected run-time
    errors in nearly any part of the kernel, including core
    kernel components. Our approach is dynamic and
    request-oriented; it isolates the effects of a fault to
    the requests that caused the fault rather than to
    static kernel components. This approach is based on a
    notion of ``recovery domains,'' an organizing principle
    to enable rollback of state affected by a request in a
    multithreaded system with minimal impact on other
    requests or threads. We have applied this approach on
    v2.4.22 and v2.6.27 of the Linux kernel and it required
    132 lines of changed or new code: the other changes are
    all performed by a simple instrumentation pass of a
    compiler. Our experiments show that the approach is
    able to recover from otherwise fatal faults with
    minimal collateral impact during a recovery event.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  remark          = {ASPLOS 2009 conference proceedings.},
}

@Article{Dimitrov:2009:ABB,
  author =       "Martin Dimitrov and Huiyang Zhou",
  title =        "Anomaly-based bug prediction, isolation, and
                 validation: an automated approach for software
                 debugging",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "1",
  pages =        "61--72",
  month =        mar,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2528521.1508252",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:47:19 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Software defects, commonly known as bugs, present a
                 serious challenge for system reliability and
                 dependability. Once a program failure is observed, the
                 debugging activities to locate the defects are
                 typically nontrivial and time consuming. In this paper,
                 we propose a novel automated approach to pin-point the
                 root-causes of software failures. Our proposed approach
                 consists of three steps. The first step is bug
                 prediction, which leverages the existing work on
                 anomaly-based bug detection as exceptional behavior
                 during program execution has been shown to frequently
                 point to the root cause of a software failure. The
                 second step is bug isolation, which eliminates
                 false-positive bug predictions by checking whether the
                 dynamic forward slices of bug predictions lead to the
                 observed program failure. The last step is bug
                 validation, in which the isolated anomalies are
                 validated by dynamically nullifying their effects and
                 observing if the program still fails. The whole bug
                 prediction, isolation and validation process is fully
                 automated and can be implemented with efficient
                 architectural support. Our experiments with 6 programs
                 and 7 bugs, including a real bug in the gcc 2.95.2
                 compiler, show that our approach is highly effective at
                 isolating only the relevant anomalies. Compared to
                 state-of-art debugging techniques, our proposed
                 approach pinpoints the defect locations more accurately
                 and presents the user with a much smaller code set to
                 analyze.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS 2009 conference proceedings.",
}

@Article{Montesinos:2009:CSH,
  author =       "Pablo Montesinos and Matthew Hicks and Samuel T. King
                 and Josep Torrellas",
  title =        "{Capo}: a software-hardware interface for practical
                 deterministic multiprocessor replay",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "1",
  pages =        "73--84",
  month =        mar,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2528521.1508254",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:47:19 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "While deterministic replay of parallel programs is a
                 powerful technique, current proposals have
                 shortcomings. Specifically, software-based replay
                 systems have high overheads on multiprocessors, while
                 hardware-based proposals focus only on basic
                 hardware-level mechanisms, ignoring the overall replay
                 system. To be practical, hardware-based replay systems
                 need to support an environment with multiple parallel
                 jobs running concurrently --- some being recorded,
                 others being replayed and even others running without
                 recording or replay. Moreover, they need to manage
                 limited-size log buffers. This paper addresses these
                 shortcomings by introducing, for the first time, a set
                 of abstractions and a software-hardware interface for
                 practical hardware-assisted replay of multiprocessor
                 systems. The approach, called Capo, introduces the
                 novel abstraction of the Replay Sphere to separate the
                 responsibilities of the hardware and software
                 components of the replay system. In this paper, we also
                 design and build CapoOne, a prototype of a
                 deterministic multiprocessor replay system that
                 implements Capo using Linux and simulated DeLorean
                 hardware. Our evaluation of 4-processor executions
                 shows that CapoOne largely records with the efficiency
                 of hardware-based schemes and the flexibility of
                 software-based schemes.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS 2009 conference proceedings.",
}

@Article{Devietti:2009:DDS,
  author =       "Joseph Devietti and Brandon Lucia and Luis Ceze and
                 Mark Oskin",
  title =        "{DMP}: deterministic shared memory multiprocessing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "1",
  pages =        "85--96",
  month =        mar,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2528521.1508255",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:47:19 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Current shared memory multicore and multiprocessor
                 systems are nondeterministic. Each time these systems
                 execute a multithreaded application, even if supplied
                 with the same input, they can produce a different
                 output. This frustrates debugging and limits the
                 ability to properly test multithreaded code, becoming a
                 major stumbling block to the much-needed widespread
                 adoption of parallel programming. In this paper we make
                 the case for fully deterministic shared memory
                 multiprocessing (DMP). The behavior of an arbitrary
                 multithreaded program on a DMP system is only a
                 function of its inputs. The core idea is to make
                 inter-thread communication fully deterministic.
                 Previous approaches to coping with nondeterminism in
                 multithreaded programs have focused on replay, a
                 technique useful only for debugging. In contrast, while
                 DMP systems are directly useful for debugging by
                 offering repeatability by default, we argue that
                 parallel programs should execute deterministically in
                 the field as well. This has the potential to make
                 testing more assuring and increase the reliability of
                 deployed multithreaded software. We propose a range of
                 approaches to enforcing determinism and discuss their
                 implementation trade-offs. We show that determinism can
                 be provided with little performance cost using our
                 architecture proposals on future hardware, and that
                 software-only approaches can be utilized on existing
                 systems.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS 2009 conference proceedings.",
}

@Article{Olszewski:2009:KED,
  author =       "Marek Olszewski and Jason Ansel and Saman
                 Amarasinghe",
  title =        "{Kendo}: efficient deterministic multithreading in
                 software",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "1",
  pages =        "97--108",
  month =        mar,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2528521.1508256",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:47:19 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Although chip-multiprocessors have become the industry
                 standard, developing parallel applications that target
                 them remains a daunting task. Non-determinism, inherent
                 in threaded applications, causes significant challenges
                 for parallel programmers by hindering their ability to
                 create parallel applications with repeatable results.
                 As a consequence, parallel applications are
                 significantly harder to debug, test, and maintain than
                 sequential programs. This paper introduces Kendo: a new
                 software-only system that provides deterministic
                 multithreading of parallel applications. Kendo enforces
                 a deterministic interleaving of lock acquisitions and
                 specially declared non-protected reads through a novel
                 dynamically load-balanced deterministic scheduling
                 algorithm. The algorithm tracks the progress of each
                 thread using performance counters to construct a
                 deterministic logical time that is used to compute an
                 interleaving of shared data accesses that is both
                 deterministic and provides good load balancing. Kendo
                 can run on today's commodity hardware while incurring
                 only a modest performance cost. Experimental results on
                 the SPLASH-2 applications yield a geometric mean
                 overhead of only 16\% when running on 4 processors.
                 This low overhead makes it possible to benefit from
                 Kendo even after an application is deployed.
                 Programmers can start using Kendo today to program
                 parallel applications that are easier to develop,
                 debug, and test.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS 2009 conference proceedings.",
}

@Article{Tiwari:2009:CIF,
  author =       "Mohit Tiwari and Hassan M. G. Wassel and Bita Mazloom
                 and Shashidhar Mysore and Frederic T. Chong and Timothy
                 Sherwood",
  title =        "Complete information flow tracking from the gates up",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "1",
  pages =        "109--120",
  month =        mar,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2528521.1508258",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:47:19 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "For many mission-critical tasks, tight guarantees on
                 the flow of information are desirable, for example,
                 when handling important cryptographic keys or sensitive
                 financial data. We present a novel architecture capable
                 of tracking all information flow within the machine,
                 including all explicit data transfers and all implicit
                 flows (those subtly devious flows caused by not
                 performing conditional operations). While the problem
                 is impossible to solve in the general case, we have
                 created a machine that avoids the general-purpose
                 programmability that leads to this impossibility
                 result, yet is still programmable enough to handle a
                 variety of critical operations such as public-key
                 encryption and authentication. Through the application
                 of our novel gate-level information flow tracking
                 method, we show how all flows of information can be
                 precisely tracked. From this foundation, we then
                 describe how a class of architectures can be
                 constructed, from the gates up, to completely capture
                 all information flows and we measure the impact of
                 doing so on the hardware implementation, the ISA, and
                 the programmer.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS 2009 conference proceedings.",
}

@Article{Tam:2009:RAL,
  author =       "David K. Tam and Reza Azimi and Livio B. Soares and
                 Michael Stumm",
  title =        "{RapidMRC}: approximating {L2} miss rate curves on
                 commodity systems for online optimizations",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "1",
  pages =        "121--132",
  month =        mar,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2528521.1508259",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:47:19 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Miss rate curves (MRCs) are useful in a number of
                 contexts. In our research, online L2 cache MRCs enable
                 us to dynamically identify optimal cache sizes when
                 cache-partitioning a shared-cache multicore processor.
                 Obtaining L2 MRCs has generally been assumed to be
                 expensive when done in software and consequently, their
                 usage for online optimizations has been limited. To
                 address these problems and opportunities, we have
                 developed a low-overhead software technique to obtain
                 L2 MRCs online on current processors, exploiting
                 features available in their performance monitoring
                 units so that no changes to the application source code
                 or binaries are required. Our technique, called
                 RapidMRC, requires a single probing period of roughly
                 221 million processor cycles (147 ms), and subsequently
                 124 million cycles (83 ms) to process the data. We
                 demonstrate its accuracy by comparing the obtained MRCs
                 to the actual L2 MRCs of 30 applications taken from
                 SPECcpu2006, SPECcpu2000, and SPECjbb2000. We show that
                 RapidMRC can be applied to sizing cache partitions,
                 helping to achieve performance improvements of up to
                 27\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS 2009 conference proceedings.",
}

@Article{Eyerman:2009:PTC,
  author =       "Stijn Eyerman and Lieven Eeckhout",
  title =        "Per-thread cycle accounting in {SMT} processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "1",
  pages =        "133--144",
  month =        mar,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2528521.1508260",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:47:19 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper proposes a cycle accounting architecture
                 for Simultaneous Multithreading (SMT) processors that
                 estimates the execution times for each of the threads
                 had they been executed alone, while they are running
                 simultaneously on the SMT processor. This is done by
                 accounting each cycle to either a base, miss event or
                 waiting cycle component during multi-threaded
                 execution. Single-threaded alone execution time is then
                 estimated as the sum of the base and miss event
                 components; the waiting cycle component represents the
                 lost cycle count due to SMT execution. The cycle
                 accounting architecture incurs reasonable hardware cost
                 (around 1KB of storage) and estimates single-threaded
                 performance with average prediction errors around 7.2\%
                 for two-program workloads and 11.7\% for four-program
                 workloads. The cycle accounting architecture has
                 several important applications to system software and
                 its interaction with SMT hardware. For one, the
                 estimated single-thread alone execution time provides
                 an accurate picture to system software of the actually
                 consumed processor cycles per thread. The alone
                 execution time instead of the total execution time
                 (timeslice) may make system software scheduling
                 policies more effective. Second, a new class of
                 thread-progress aware SMT fetch policies based on
                 per-thread progress indicators enable system software
                 level priorities to be enforced at the hardware
                 level.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS 2009 conference proceedings.",
}

@Article{Hofmann:2009:MBM,
  author =       "Owen S. Hofmann and Christopher J. Rossbach and Emmett
                 Witchel",
  title =        "Maximum benefit from a minimal {HTM}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "1",
  pages =        "145--156",
  month =        mar,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2528521.1508262",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:47:19 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "A minimal, bounded hardware transactional memory
                 implementation significantly improves synchronization
                 performance when used in an operating system kernel. We
                 add HTM to Linux 2.4, a kernel with a simple,
                 coarse-grained synchronization structure. The
                 transactional Linux 2.4 kernel can improve performance
                 of user programs by as much as 40\% over the
                 non-transactional 2.4 kernel. It closes 68\% of the
                 performance gap with the Linux 2.6 kernel, which has
                 had significant engineering effort applied to improve
                 scalability. We then extend our minimal HTM to a fast,
                 unbounded transactional memory with a novel technique
                 for coordinating hardware transactions and software
                 synchronization. Overflowed transactions run in
                 software, with only a minimal coupling between hardware
                 and software systems. There is no performance penalty
                 for overflow rates of less than 1\%. In one instance,
                 at 16 processors and an overflow rate of 4\%,
                 performance degrades from an ideal 4.3x to 3.6x.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS 2009 conference proceedings.",
}

@Article{Dice:2009:EEC,
  author =       "Dave Dice and Yossi Lev and Mark Moir and Daniel
                 Nussbaum",
  title =        "Early experience with a commercial hardware
                 transactional memory implementation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "1",
  pages =        "157--168",
  month =        mar,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2528521.1508263",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:47:19 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "We report on our experience with the hardware
                 transactional memory (HTM) feature of two
                 pre-production revisions of a new commercial multicore
                 processor. Our experience includes a number of
                 promising results using HTM to improve performance in a
                 variety of contexts, and also identifies some ways in
                 which the feature could be improved to make it even
                 better. We give detailed accounts of our experiences,
                 sharing techniques we used to achieve the results we
                 have, as well as describing challenges we faced in
                 doing so.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS 2009 conference proceedings.",
}

@Article{Wells:2009:MMM,
  author =       "Philip M. Wells and Koushik Chakraborty and Gurindar
                 S. Sohi",
  title =        "Mixed-mode multicore reliability",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "1",
  pages =        "169--180",
  month =        mar,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2528521.1508265",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:47:19 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Future processors are expected to observe increasing
                 rates of hardware faults. Using Dual-Modular Redundancy
                 (DMR), two cores of a multicore can be loosely coupled
                 to redundantly execute a single software thread,
                 providing very high coverage from many different
                 sources of faults. This reliability, however, comes at
                 a high price in terms of per-thread IPC and overall
                 system throughput. We make the observation that a user
                 may want to run both applications requiring high
                 reliability, such as financial software, and more fault
                 tolerant applications requiring high performance, such
                 as media or web software, on the same machine at the
                 same time. Yet a traditional DMR system must fully
                 operate in redundant mode whenever any application
                 requires high reliability. This paper proposes a
                 Mixed-Mode Multicore (MMM), which enables most
                 applications, including the system software, to run
                 with high reliability in DMR mode, while applications
                 that need high performance can avoid the penalty of
                 DMR. Though conceptually simple, two key challenges
                 arise: (1) care must be taken to protect reliable
                 applications from any faults occurring to applications
                 running in high performance mode, and (2) the desire to
                 execute additional independent software threads for a
                 performance application complicates the scheduling of
                 computation to cores. After solving these issues, an
                 MMM is shown to improve overall system performance,
                 compared to a traditional DMR system, by approximately
                 2X when one reliable and one performance application
                 are concurrently executing.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS 2009 conference proceedings.",
}

@Article{Rajamani:2009:IDE,
  author =       "Sriram Rajamani and G. Ramalingam and Venkatesh Prasad
                 Ranganath and Kapil Vaswani",
  title =        "{ISOLATOR}: dynamically ensuring isolation in
                 concurrent programs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "1",
  pages =        "181--192",
  month =        mar,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2528521.1508266",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:47:19 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In this paper, we focus on concurrent programs that
                 use locks to achieve isolation of data accessed by
                 critical sections of code. We present ISOLATOR, an
                 algorithm that guarantees isolation for well-behaved
                 threads of a program that obey a locking discipline
                 even in the presence of ill-behaved threads that
                 disobey the locking discipline. ISOLATOR uses code
                 instrumentation, data replication, and virtual memory
                 protection to detect isolation violations and delays
                 ill-behaved threads to ensure isolation. Our
                 instrumentation scheme requires access only to the code
                 of well-behaved threads. We have evaluated ISOLATOR on
                 several benchmark programs and found that ISOLATOR can
                 ensure isolation with reasonable runtime overheads. In
                 addition, we present three general desiderata ---
                 safety, isolation, and permissiveness --- for any
                 scheme that attempts to ensure isolation, and formally
                 prove that ISOLATOR satisfies all of these
                 desiderata.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS 2009 conference proceedings.",
}

@Article{Tucek:2009:EOV,
  author =       "Joseph Tucek and Weiwei Xiong and Yuanyuan Zhou",
  title =        "Efficient online validation with delta execution",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "1",
  pages =        "193--204",
  month =        mar,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2528521.1508267",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:47:19 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Software systems are constantly changing. Patches to
                 fix bugs and patches to add features are all too
                 common. Every change risks breaking a previously
                 working system. Hence administrators loathe change, and
                 are willing to delay even critical security patches
                 until after fully validating their correctness.
                 Compared to off-line validation, on-line validation has
                 clear advantages since it tests against real life
                 workloads. Yet unfortunately it imposes restrictive
                 overheads as it requires running the old and new
                 versions side-by-side. Moreover, due to spurious
                 differences (e.g. event timing, random number
                 generation, and thread interleavings), it is difficult
                 to compare the two for validation. To allow more
                 effective on-line patch validation, we propose a new
                 mechanism, called delta execution, that is based on the
                 observation that most patches are small. Delta
                 execution merges the two side-by-side executions for
                 most of the time and splits only when necessary, such
                 as when they access different data or execute different
                 code. This allows us to perform on-line validation not
                 only with lower overhead but also with greatly reduced
                 spurious differences, allowing us to effectively
                 validate changes. We first validate the feasibility of
                 our idea by studying the characteristics of 240 patches
                 from 4 server programs; our examination shows that 77\%
                 of the changes should not be expected to cause large
                 changes and are thereby feasible for Delta execution.
                 We then implemented Delta execution using dynamic
                 instrumentation. Using real world patches from 7 server
                 applications and 3 other programs, we compared our
                 implementation of Delta execution against a traditional
                 side-by-side on-line validation. Delta execution
                 outperformed traditional validation by up to 128\%;
                 further, for 3 of the changes, spurious differences
                 caused the traditional validation to fail completely
                 while Delta execution succeeded. This demonstrates that
                 Delta execution can allow administrators to use on-line
                 validation to confidently ensure the correctness of the
                 changes they apply.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS 2009 conference proceedings.",
}

@Article{Meisner:2009:PES,
  author =       "David Meisner and Brian T. Gold and Thomas F.
                 Wenisch",
  title =        "{PowerNap}: eliminating server idle power",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "1",
  pages =        "205--216",
  month =        mar,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2528521.1508269",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:47:19 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Data center power consumption is growing to
                 unprecedented levels: the EPA estimates U.S. data
                 centers will consume 100 billion kilowatt hours
                 annually by 2011. Much of this energy is wasted in idle
                 systems: in typical deployments, server utilization is
                 below 30\%, but idle servers still consume 60\% of
                 their peak power draw. Typical idle periods---though
                 frequent---last seconds or less, confounding simple
                 energy-conservation approaches. In this paper, we
                 propose PowerNap, an energy-conservation approach where
                 the entire system transitions rapidly between a
                 high-performance active state and a near-zero-power
                 idle state in response to instantaneous load. Rather
                 than requiring fine-grained power-performance states
                 and complex load-proportional operation from each
                 system component, PowerNap instead calls for minimizing
                 idle power and transition time, which are simpler
                 optimization goals. Based on the PowerNap concept, we
                 develop requirements and outline mechanisms to
                 eliminate idle power waste in enterprise blade servers.
                 Because PowerNap operates in low-efficiency regions of
                 current blade center power supplies, we introduce the
                 Redundant Array for Inexpensive Load Sharing (RAILS), a
                 power provisioning approach that provides high
                 conversion efficiency across the entire range of
                 PowerNap's power demands. Using utilization traces
                 collected from enterprise-scale commercial deployments,
                 we demonstrate that, together, PowerNap and RAILS
                 reduce average server power consumption by 74\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS 2009 conference proceedings.",
}

@Article{Caulfield:2009:GUF,
  author = {Adrian M. Caulfield and Laura M. Grupp and Steven Swanson},
  title = {{Gordon}: using flash memory to build fast,
    power-efficient clusters for data-intensive
    applications},
  journal = j-COMP-ARCH-NEWS,
  volume = {37},
  number = {1},
  pages = {217--228},
  month = mar,
  year = {2009},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/2528521.1508270},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Thu Sep 4 07:47:19 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {As our society becomes more information-driven, we
    have begun to amass data at an astounding and
    accelerating rate. At the same time, power concerns
    have made it difficult to bring the necessary
    processing power to bear on querying, processing, and
    understanding this data. We describe Gordon, a system
    architecture for data-centric applications that
    combines low-power processors, flash memory, and
    data-centric programming systems to improve performance
    for data-centric applications while reducing power
    consumption. The paper presents an exhaustive analysis
    of the design space of Gordon systems, focusing on the
    trade-offs between power, energy, and performance that
    Gordon must make. It analyzes the impact of
    flash-storage and the Gordon architecture on the
    performance and power efficiency of data-centric
    applications. It also describes a novel flash
    translation layer tailored to data intensive workloads
    and large flash storage arrays. Our data show that,
    using technologies available in the near future, Gordon
    systems can out-perform disk-based clusters by 1.5$
    \times $ and deliver up to 2.5$ \times $ more
    performance per Watt.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS 2009 conference proceedings.},
}

@Article{Gupta:2009:DFT,
  author = {Aayush Gupta and Youngjae Kim and Bhuvan Urgaonkar},
  title = {{DFTL}: a flash translation layer employing
    demand-based selective caching of page-level address
    mappings},
  journal = j-COMP-ARCH-NEWS,
  volume = {37},
  number = {1},
  pages = {229--240},
  month = mar,
  year = {2009},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/2528521.1508271},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Thu Sep 4 07:47:19 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {Recent technological advances in the development of
    flash-memory based devices have consolidated their
    leadership position as the preferred storage media in
    the embedded systems market and opened new vistas for
    deployment in enterprise-scale storage systems. Unlike
    hard disks, flash devices are free from any mechanical
    moving parts, have no seek or rotational delays and
    consume lower power. However, the internal
    idiosyncrasies of flash technology make its performance
    highly dependent on workload characteristics. The poor
    performance of random writes has been a cause of major
    concern, which needs to be addressed to better utilize
    the potential of flash in enterprise-scale
    environments. We examine one of the important causes of
    this poor performance: the design of the Flash
    Translation Layer (FTL), which performs the
    virtual-to-physical address translations and hides the
    erase-before-write characteristics of flash. We propose
    a complete paradigm shift in the design of the core FTL
    engine from the existing techniques with our
    Demand-based Flash Translation Layer (DFTL), which
    selectively caches page-level address mappings. We
    develop a flash simulation framework called FlashSim.
    Our experimental evaluation with realistic
    enterprise-scale workloads endorses the utility of DFTL
    in enterprise-scale storage systems by demonstrating:
    (i) improved performance, (ii) reduced garbage
    collection overhead and (iii) better overload behavior
    compared to state-of-the-art FTL schemes. For example,
    a predominantly random-write dominant I/O trace from an
    OLTP application running at a large financial
    institution shows a 78\% improvement in average
    response time (due to a 3-fold reduction in operations
    of the garbage collector), compared to a
    state-of-the-art FTL scheme. Even for the well-known
    read-dominant TPC-H benchmark, for which DFTL
    introduces additional overheads, we improve system
    response time by 56\%.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS 2009 conference proceedings.},
}

@Article{Aleen:2009:CAS,
  author = {Farhana Aleen and Nathan Clark},
  title = {Commutativity analysis for software parallelization:
    letting program transformations see the big picture},
  journal = j-COMP-ARCH-NEWS,
  volume = {37},
  number = {1},
  pages = {241--252},
  month = mar,
  year = {2009},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/2528521.1508273},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Thu Sep 4 07:47:19 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {Extracting performance from many-core architectures
    requires software engineers to create multi-threaded
    applications, which significantly complicates the
    already daunting task of software development. One
    solution to this problem is automatic compile-time
    parallelization, which can ease the burden on software
    developers in many situations. Clearly, automatic
    parallelization in its present form is not suitable for
    many application domains and new compiler analyses are
    needed address its shortcomings. In this paper, we
    present one such analysis: a new approach for detecting
    commutative functions. Commutative functions are
    sections of code that can be executed in any order
    without affecting the outcome of the application, e.g.,
    inserting elements into a set. Previous research on
    this topic had one significant limitation, in that the
    results of a commutative functions must produce
    identical memory layouts. This prevented previous
    techniques from detecting functions like malloc, which
    may return different pointers depending on the order in
    which it is called, but these differing results do not
    affect the overall output of the application. Our new
    commutativity analysis correctly identify these
    situations to better facilitate automatic
    parallelization. We demonstrate that this analysis can
    automatically extract significant amounts of
    parallelism from many applications, and where it is
    ineffective it can provide software developers a useful
    list of functions that may be commutative provided
    semantic program changes that are not automatable.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS 2009 conference proceedings.},
}

@Article{Suleman:2009:ACS,
  author =       "M. Aater Suleman and Onur Mutlu and Moinuddin K.
                 Qureshi and Yale N. Patt",
  title =        "Accelerating critical section execution with
                 asymmetric multi-core architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "1",
  pages =        "253--264",
  month =        mar,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2528521.1508274",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:47:19 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "To improve the performance of a single application on
                 Chip Multiprocessors (CMPs), the application must be
                 split into threads which execute concurrently on
                 multiple cores. In multi-threaded applications,
                 critical sections are used to ensure that only one
                 thread accesses shared data at any given time. Critical
                 sections can serialize the execution of threads, which
                 significantly reduces performance and scalability. This
                 paper proposes Accelerated Critical Sections (ACS), a
                 technique that leverages the high-performance core(s)
                 of an Asymmetric Chip Multiprocessor (ACMP) to
                 accelerate the execution of critical sections. In ACS,
                 selected critical sections are executed by a
                 high-performance core, which can execute the critical
                 section faster than the other, smaller cores. As a
                 result, ACS reduces serialization: it lowers the
                 likelihood of threads waiting for a critical section to
                 finish. Our evaluation on a set of 12
                 critical-section-intensive workloads shows that ACS
                 reduces the average execution time by 34\% compared to
                 an equal-area 32-core symmetric CMP and by 23\%
                 compared to an equal-area ACMP. Moreover, for 7 out of
                 the 12 workloads, ACS improves scalability by
                 increasing the number of threads at which performance
                 saturates.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS 2009 conference proceedings.",
}

@Article{Mytkowicz:2009:PWD,
  author = {Todd Mytkowicz and Amer Diwan and Matthias Hauswirth
    and Peter F. Sweeney},
  title = {Producing wrong data without doing anything obviously
    wrong!},
  journal = j-COMP-ARCH-NEWS,
  volume = {37},
  number = {1},
  pages = {265--276},
  month = mar,
  year = {2009},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/2528521.1508275},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Thu Sep 4 07:47:19 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {This paper presents a surprising result: changing a
    seemingly innocuous aspect of an experimental setup can
    cause a systems researcher to draw wrong conclusions
    from an experiment. What appears to be an innocuous
    aspect in the experimental setup may in fact introduce
    a significant bias in an evaluation. This phenomenon is
    called measurement bias in the natural and social
    sciences. Our results demonstrate that measurement bias
    is significant and commonplace in computer system
    evaluation. By significant we mean that measurement
    bias can lead to a performance analysis that either
    over-states an effect or even yields an incorrect
    conclusion. By commonplace we mean that measurement
    bias occurs in all architectures that we tried (Pentium
    4, Core 2, and m5 O3CPU), both compilers that we tried
    (gcc and Intel's C compiler), and most of the SPEC
    CPU2006 C programs. Thus, we cannot ignore measurement
    bias. Nevertheless, in a literature survey of 133
    recent papers from ASPLOS, PACT, PLDI, and CGO, we
    determined that none of the papers with experimental
    results adequately consider measurement bias. Inspired
    by similar problems and their solutions in other
    sciences, we describe and demonstrate two methods, one
    for detecting (causal analysis) and one for avoiding
    (setup randomization) measurement bias.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS 2009 conference proceedings.},
}

@Article{Bond:2009:LP,
  author = {Michael D. Bond and Kathryn S. McKinley},
  title = {Leak pruning},
  journal = j-COMP-ARCH-NEWS,
  volume = {37},
  number = {1},
  pages = {277--288},
  month = mar,
  year = {2009},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/2528521.1508277},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Thu Sep 4 07:47:19 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {Managed languages improve programmer productivity with
    type safety and garbage collection, which eliminate
    memory errors such as dangling pointers, double frees,
    and buffer overflows. However, because garbage
    collection uses reachability to over-approximate live
    objects, programs may still leak memory if programmers
    forget to eliminate the last reference to an object
    that will not be used again. Leaks slow programs by
    increasing collector workload and frequency. Growing
    leaks eventually crash programs. This paper introduces
    leak pruning, which keeps programs running by
    predicting and reclaiming leaked objects at run time.
    It predicts dead objects and reclaims them based on
    observing data structure usage patterns. Leak pruning
    preserves semantics because it waits for heap
    exhaustion before reclaiming objects and poisons
    references to objects it reclaims. If the program later
    tries to access a poisoned reference, the virtual
    machine (VM) throws an error. We show leak pruning has
    low overhead in a Java VM and evaluate it on 10 leaking
    programs. Leak pruning does not help two programs,
    executes five substantial programs 1.6-81X longer, and
    executes three programs, including a leak in Eclipse,
    for at least 24 hours. In the worst case, leak pruning
    defers fatal errors. In the best case, it keeps leaky
    programs running with preserved semantics and
    consistent throughput.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS 2009 conference proceedings.},
}

@Article{Wegiel:2009:DPC,
  author = {Michal Wegiel and Chandra Krintz},
  title = {Dynamic prediction of collection yield for managed
    runtimes},
  journal = j-COMP-ARCH-NEWS,
  volume = {37},
  number = {1},
  pages = {289--300},
  month = mar,
  year = {2009},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/2528521.1508278},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Thu Sep 4 07:47:19 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {The growth in complexity of modern systems makes it
    increasingly difficult to extract high-performance. The
    software stacks for such systems typically consist of
    multiple layers and include managed runtime
    environments (MREs). In this paper, we investigate
    techniques to improve cooperation between these layers
    and the hardware to increase the efficacy of automatic
    memory management in MREs. General-purpose MREs
    commonly implement parallel and/or concurrent garbage
    collection and employ compaction to eliminate heap
    fragmentation. Moreover, most systems trigger
    collection based on the amount of heap a program uses.
    Our analysis shows that in many cases this strategy
    leads to ineffective collections that are unable to
    reclaim sufficient space to justify the incurred cost.
    To avoid such collections, we exploit the observation
    that dead objects tend to cluster together and form
    large, never-referenced, regions in the address space
    that correlate well with virtual pages that have not
    recently been referenced by the application. We
    leverage this correlation to design a new, simple and
    light-weight, yield predictor that estimates the amount
    of reclaimable space in the heap using hardware page
    reference bits. Our predictor allows MREs to avoid
    low-yield collections and thereby improve resource
    management. We integrate this predictor into three
    state-of-the-art parallel compactors, implemented in
    the HotSpot JVM, that represent distinct canonical heap
    layouts. Our empirical evaluation, based on standard
    Java benchmarks and open-source applications, indicates
    that inexpensive and accurate yield prediction can
    improve performance significantly.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS 2009 conference proceedings.},
}

@Article{Menon:2009:TSA,
  author =       "Aravind Menon and Simon Schubert and Willy
                 Zwaenepoel",
  title =        "{TwinDrivers}: semi-automatic derivation of fast and
                 safe hypervisor network drivers from guest {OS}
                 drivers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "1",
  pages =        "301--312",
  month =        mar,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2528521.1508279",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:47:19 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In a virtualized environment, device drivers are often
                 run inside a virtual machine (VM) rather than in the
                 hypervisor, for reasons of safety and reduction in
                 software engineering effort. Unfortunately, this
                 approach results in poor performance for I/O-intensive
                 devices such as network cards. The alternative approach
                 of running device drivers directly in the hypervisor
                 yields better performance, but results in the loss of
                 safety guarantees for the hypervisor and incurs
                 additional software engineering costs. In this paper we
                 present TwinDrivers, a framework which allows us to
                 semi-automatically create safe and efficient hypervisor
                 drivers from guest OS drivers. The hypervisor driver
                 runs directly in the hypervisor, but its data resides
                 completely in the driver VM address space. A Software
                 Virtual Memory mechanism allows the driver to access
                 its VM data efficiently from the hypervisor running in
                 any guest context, and also protects the hypervisor
                 from invalid memory accesses from the driver. An upcall
                 mechanism allows the hypervisor to largely reuse the
                 driver support infrastructure present in the VM. The
                 TwinDriver system thus combines most of the performance
                 benefits of hypervisor-based driver approaches with the
                 safety and software engineering benefits of VM-based
                 driver approaches. Using the TwinDrivers hypervisor
                 driver, we are able to improve the guest domain
                 networking throughput in Xen by a factor of 2.4 for
                 transmit workloads, and 2.1 for receive workloads, both
                 in CPU-scaled units, and achieve close to 64--67\% of
                 native Linux throughput.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS 2009 conference proceedings.",
}

@Article{Burcea:2009:PBV,
  author =       "Ioana Burcea and Andreas Moshovos",
  title =        "{Phantom-BTB}: a virtualized branch target buffer
                 design",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "1",
  pages =        "313--324",
  month =        mar,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2528521.1508281",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:47:19 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Modern processors use branch target buffers (BTBs) to
                 predict the target address of branches such that they
                 can fetch ahead in the instruction stream increasing
                 concurrency and performance. Ideally, BTBs would be
                 sufficiently large to capture the entire working set of
                 the application and sufficiently small for fast access
                 and practical on-chip dedicated storage. Depending on
                 the application, these requirements are at odds. This
                 work introduces a BTB design that accommodates large
                 instruction footprints without dedicating expensive
                 on-chip resources. In the proposed Phantom-BTB (PBTB)
                 design, a conventional BTB is augmented with a virtual
                 table that collects branch target information as the
                 application runs. The virtual table does not have fixed
                 dedicated storage. Instead, it is transparently
                 allocated, on demand, in the on-chip caches, at cache
                 line granularity. The entries in the virtual table are
                 proactively prefetched and installed in the dedicated
                 conventional BTB, thus, increasing its perceived
                 capacity. Experimental results with commercial
                 workloads under full-system simulation demonstrate that
                 PBTB improves IPC performance over a 1K-entry BTB by
                 6.9\% on average and up to 12.7\%, with a storage
                 overhead of only 8\%. Overall, the virtualized design
                 performs within 1\% of a conventional 4K-entry,
                 single-cycle access BTB, while the dedicated storage is
                 3.6 times smaller.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS 2009 conference proceedings.",
}

@Article{Ramani:2009:SSF,
  author = {Karthik Ramani and Christiaan P. Gribble and Al Davis},
  title = {{StreamRay}: a stream filtering architecture for
    coherent ray tracing},
  journal = j-COMP-ARCH-NEWS,
  volume = {37},
  number = {1},
  pages = {325--336},
  month = mar,
  year = {2009},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/2528521.1508282},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Thu Sep 4 07:47:19 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {The wide availability of commodity graphics processors
    has made real-time graphics an intrinsic component of
    the human/computer interface. These graphics cores
    accelerate the z-buffer algorithm and provide a highly
    interactive experience at a relatively low cost.
    However, many applications in entertainment, science,
    and industry require high quality lighting effects such
    as accurate shadows, reflection, and refraction. These
    effects can be difficult to achieve with z-buffer
    algorithms but are straightforward to implement using
    ray tracing. Although ray tracing is computationally
    more complex, the algorithm exhibits excellent scaling
    and parallelism properties. Nevertheless, ray tracing
    memory access patterns are difficult to predict and the
    parallelism speedup promise is therefore hard to
    achieve. This paper highlights a novel approach to ray
    tracing based on stream filtering and presents
    StreamRay, a multicore wide SIMD microarchitecture that
    delivers interactive frame rates of 15-32 frames/second
    for scenes of high geometric complexity and exhibits
    high utilization for SIMD widths ranging from eight to
    16 elements. StreamRay consists of two main components:
    the ray engine, which is responsible for stream
    assembly and employs address generation units that
    generate addresses to form large SIMD vectors, and the
    filter engine, which implements the ray tracing
    operations with programmable accelerators. Results
    demonstrate that separating address and data processing
    reduces data movement and resource contention.
    Performance improves by 56\% while simultaneously
    providing 11.63\% power savings per accelerator core
    compared to a design which does not use separate
    resources for address and data computations.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS 2009 conference proceedings.},
}

@Article{Cameron:2009:ASS,
  author          = {Robert D. Cameron and Dan Lin},
  title           = {Architectural support for {SWAR} text processing
                     with parallel bit streams: the inductive doubling
                     principle},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {37},
  number          = {1},
  pages           = {337--348},
  month           = mar,
  year            = {2009},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/2528521.1508283},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Thu Sep 4 07:47:19 MDT 2014},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Parallel bit stream algorithms exploit the SWAR
                     (SIMD within a register) capabilities of commodity
                     processors in high-performance text processing
                     applications such as UTF-8 to UTF-16 transcoding,
                     XML parsing, string search and regular expression
                     matching. Direct architectural support for these
                     algorithms in future SWAR instruction sets could
                     further increase performance as well as
                     simplifying the programming task. A set of simple
                     SWAR instruction set extensions are proposed for
                     this purpose based on the principle of systematic
                     support for inductive doubling as an algorithmic
                     technique. These extensions are shown to
                     significantly reduce instruction count in core
                     parallel bit stream algorithms, often providing a
                     3X or better improvement. The extensions are also
                     shown to be useful for SWAR programming in other
                     application areas, including providing a
                     systematic treatment for horizontal operations. An
                     implementation model for these extensions involves
                     relatively simple circuitry added to the operand
                     fetch components in a pipelined processor.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  remark          = {ASPLOS 2009 conference proceedings.},
}

@Article{Jouppi:2009:ISI,
  author =       "Norman P. Jouppi and Rakesh Kumar and Dean Tullsen",
  title =        "Introduction to the special issue on the {2008
                 Workshop on Design, Analysis, and Simulation of Chip
                 Multiprocessors (dasCMP'08)}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "2",
  pages =        "1--1",
  month =        may,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1577129.1577131",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:39 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Zeng:2009:MCA,
  author =       "Hui Zeng and Matt Yourst and Kanad Ghose and Dmitry
                 Ponomarev",
  title =        "{MPTLsim}: a cycle-accurate, full-system simulator for
                 x86-64 multicore architectures with coherent caches",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "2",
  pages =        "2--9",
  month =        may,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1577129.1577132",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:39 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The introduction of multicore microprocessors in the
                 recent years has made it imperative to use
                 cycle-accurate and full-system simulators in the
                 architecture research community. We introduce MPTLsim a
                 multicore simulator for the X86 ISA that meets this
                 need. MPTLsim is a uop-accurate, cycle-accurate,
                 full-system simulator for multicore designs based on
                 the X86-64 ISA. MPTLsim extends PTLsim, a publicly
                 available single core simulator, with a host of
                 additional features to support hyperthreading within a
                 core and multiple cores, with detailed models for
                 caches, on-chip interconnections and the memory data
                 flow. MPTLsim incorporates detailed simulation models
                 for cache controllers, interconnections and has
                 built-in implementations of a number of cache coherency
                 protocols.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Monchiero:2009:HSC,
  author =       "Matteo Monchiero and Jung Ho Ahn and Ayose Falc{\'o}n
                 and Daniel Ortega and Paolo Faraboschi",
  title =        "How to simulate 1000 cores",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "2",
  pages =        "10--19",
  month =        may,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1577129.1577133",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:39 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper proposes a novel methodology to efficiently
                 simulate shared-memory multiprocessors composed of
                 hundreds of cores. The basic idea is to use
                 thread-level parallelism in the software system and
                 translate it into core-level parallelism in the
                 simulated world. To achieve this, we first augment an
                 existing full-system simulator to identify and separate
                 the instruction streams belonging to the different
                 software threads. Then, the simulator dynamically maps
                 each instruction flow to the corresponding core of the
                 target multi-core architecture, taking into account the
                 inherent thread synchronization of the running
                 applications. Our simulator allows a user to execute
                 any multithreaded application in a conventional
                 full-system simulator and evaluate the performance of
                 the application on a many-core hardware. We carried out
                 extensive simulations on the SPLASH-2 benchmark suite
                 and demonstrated the scalability up to 1024 cores with
                 limited simulation speed degradation vs. the
                 single-core case on a fixed workload. The results also
                 show that the proposed technique captures the intrinsic
                 behavior of the SPLASH-2 suite, even when we scale up
                 the number of shared-memory cores beyond the
                 thousand-core limit.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chen:2009:SPP,
  author =       "Jianwei Chen and Murali Annavaram and Michel Dubois",
  title =        "{SlackSim}: a platform for parallel simulations of
                 {CMPs} on {CMPs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "2",
  pages =        "20--29",
  month =        may,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1577129.1577134",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:39 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The fast simulation of chip multiprocessors (CMPs)
                 presents a critical challenge to the architecture
                 research community as both industry and academia shift
                 their research focus to multicore design. Parallel
                 simulation is a technique to accelerate
                 microarchitecture simulation of CMPs by exploiting the
                 inherent parallelism of CMPs. In this paper, we explore
                 the simulation paradigm of simulating each core of a
                 target CMP in one thread and then spreading the threads
                 across the hardware thread contexts of a host CMP. We
                 implement several parallel simulation schemes using
                 POSIX Threads (Pthreads). We start with cycle-by-cycle
                 simulation and then relax the synchronization condition
                 in various schemes, which we call slack
                 simulations.\par

                 In slack simulations, the Pthreads simulating different
                 simulated cores do not synchronize after each simulated
                 cycle, but rather they are given some slack. The slack
                 is the difference in cycle between the simulated times
                 of any two target cores. Small slacks, such as a few
                 cycles, greatly improve the efficiency of parallel CMP
                 simulations, with no or negligible simulation error. We
                 have developed a simulation framework called SlackSim
                 to experiment with various slack simulation schemes.
                 Unlike previous attempts to parallelize multiprocessor
                 simulations on distributed memory machines, SlackSim
                 takes advantage of the efficient sharing of data in the
                 host CMP architecture.\par

                 We demonstrate the efficiency and accuracy of some well
                 known slack simulation schemes and of some new ones on
                 SlackSim running on a state-of-the-art CMP platform.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Purnaprajna:2009:RTR,
  author =       "Madhura Purnaprajna and Mario Porrmann and Ulrich
                 Rueckert",
  title =        "Run-time reconfigurability in embedded
                 multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "2",
  pages =        "30--37",
  month =        may,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1577129.1577135",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:39 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "To meet application-specific performance demands,
                 architectures are predominantly redesigned and
                 customised. Every architectural change results in huge
                 overheads in design, verification, and fabrication,
                 which together result in prolonged time-to-market. As
                 an alternative, configurable architectures provide easy
                 adaptability to different application domains in place
                 of costly redesigns. To deal with application changes
                 and custom requirements, a method of configuring and
                 reusing the basic building blocks within processors is
                 developed. Additionally, this enables co-operative
                 multiprocessing. In this paper, a runtime
                 reconfiguration mechanism for embedded multiprocessor
                 architectures is proposed as a method to introduce
                 customisations in the post-fabrication phase. A method
                 of application description in conjunction with a
                 flexible reconfigurable multiprocessor template is
                 presented. Finally, the costs and benefits of this
                 approach are analysed for computationally intensive
                 algorithms used in digital signal processing. The
                 impact of application specific characteristics on
                 execution time, power consumption, and total energy
                 dissipation are analysed.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Jesshope:2009:ISM,
  author =       "Chris Jesshope and Mike Lankamp and Li Zhang",
  title =        "The implementation of an {SVP} many-core processor and
                 the evaluation of its memory architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "2",
  pages =        "38--45",
  month =        may,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1577129.1577136",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:39 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Many-core processor architectures require scalable
                 solutions that reflect the locality and power
                 constraints of future generations of silicon
                 technology. This paper presents a many-core processor
                 that supports an abstract model of concurrency, based
                 on a Self-adaptive Virtual Processor (SVP). This
                 processor implements instructions, which automatically
                 map and schedule threads providing a code devoid of any
                 explicit communication. The thrust of this approach is
                 to produce binary code that is divorced from
                 implementation parameters, yet, which still gives good
                 performance over future generations of CMPs. A key
                 component of this processor architecture is the memory
                 system. This paper briefly presents the model and
                 evaluates its memory architecture.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Singh:2009:RTP,
  author =       "Karan Singh and Major Bhadauria and Sally A. McKee",
  title =        "Real time power estimation and thread scheduling via
                 performance counters",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "2",
  pages =        "46--55",
  month =        may,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1577129.1577137",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:39 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Estimating power consumption is critical for hardware
                 and software developers, and of the latter,
                 particularly for OS programmers writing process
                 schedulers. However, obtaining processor and system
                 power consumption information can be non-trivial.
                 Simulators are time consuming and prone to error. Power
                 meters report whole-system consumption, but cannot give
                 per-processor or per-thread information. More intrusive
                 hardware instrumentation is possible, but such
                 solutions are usually employed while designing the
                 system, and are not meant for customer use.\par

                 Given these difficulties, plus the current availability
                 of some form of performance counters on virtually all
                 platforms (even though such counters were initially
                 designed for system bring-up, and not intended for
                 general programmer consumption), we analytically derive
                 functions for real-time estimation of processor and
                 system power consumption using performance counter data
                 on real hardware. Our model uses data gathered from
                 microbenchmarks that capture potential application
                 behavior. The model is independent of our test
                 benchmarks, and thus we expect it to be well suited for
                 future applications. We target chip multiprocessors,
                 analyzing effects of shared resources and temperature
                 on power estimation, leveraging our model to implement
                 a simple, power-aware thread scheduler. The NAS and
                 SPEC-OMP benchmarks shows a median error of 5.8\% and
                 3.9\%, respectively. SPEC 2006 shows a marginally
                 higher median error of 7.2\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Azizi:2009:AEC,
  author =       "Omid Azizi and Aqeel Mahesri and Sanjay J. Patel and
                 Mark Horowitz",
  title =        "Area-efficiency in {CMP} core design: co-optimization
                 of microarchitecture and physical design",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "2",
  pages =        "56--65",
  month =        may,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1577129.1577138",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:39 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In this paper, we examine the area-performance design
                 space of a processing core for a chip multiprocessor
                 (CMP), considering both the architectural design space
                 and the tradeoffs of the physical design on which the
                 architecture relies. We first propose a methodology for
                 performing an integrated optimization of both the
                 micro-architecture and the physical circuit design of a
                 microprocessor. In our approach, we use statistical and
                 convex fitting methods to capture a large
                 micro-architectural design space. We then characterize
                 the area-delay tradeoffs of the underlying circuits
                 through RTL synthesis. Finally, we establish the
                 relationship between the architecture and the circuits
                 in an integrative model, which we use to optimize the
                 processor. As a case study, we apply this methodology
                 to explore the performance-area tradeoffs in a highly
                 parallel accelerator architecture for visual computing
                 applications. Based on some early circuit tradeoff
                 data, our results indicate that two separate designs
                 are performance/area optimal for our set of benchmarks:
                 a simpler single-issue, 2-way multithreaded core
                 running at high-frequency, and a more aggressively
                 tuned dual-issue 4-way multithreaded design running at
                 a lower frequency.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:2009:INa,
  author =       "Mark Thorson",
  title =        "{Internet} nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "2",
  pages =        "66--69",
  month =        may,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1577129.1577140",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:39 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Yelick:2009:TWW,
  author =       "Katherine Yelick",
  title =        "Ten ways to waste a parallel computer",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "1--1",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555754.1555755",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "As clock speed increases taper off and hardware
                 designers struggle to scale parallelism within a chip,
                 software developers and researchers must face the
                 challenge of writing portable software with no clear
                 architectural target. On the hardware side, energy
                 considerations will dominate many of the design
                 decisions, and will ultimately limit what systems and
                 applications can be built. This is especially true at
                 the high end, where the next major milestone of
                 exascale computing will be unattainable without major
                 improvements in efficiency.\par

                 Although hardware designers have long worried about the
                 efficiency of their designs, especially for
                 battery-operated devices, software developers in
                 general have not. To illustrate this point, I will
                 describe some of the top ways to waste time and
                 therefore energy waiting for communication,
                 synchronization, or interactions with users or other
                 systems. Data movement, rather than computation, is the
                 big consumer of energy, yet software often moves data
                 up and down the memory hierarchy or across a network
                 multiple times. At the same time, hardware designers
                 need to take into account the constraints of the
                 computational problems that will run on their systems,
                 as a design that is poorly matched to the computational
                 requirements will end up being inefficient. Drawing on
                 my own experience in scientific computing, I will give
                 examples of how to make the combination of hardware,
                 algorithms and software more efficient, but also
                 describe some of the challenges that are inherent in
                 the application problems we want to solve. The
                 community needs to take an integrated approach to the
                 problem, and consider how much business or science can
                 be done per Joule, rather than optimizing a particular
                 component of the system in isolation. This will require
                 rethinking the algorithms, programming models, and
                 hardware in concert, and therefore an unprecedented
                 level of collaboration and cooperation between hardware
                 and software designers.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "energy; parallel computer",
}

@Article{Lee:2009:APC,
  author =       "Benjamin C. Lee and Engin Ipek and Onur Mutlu and Doug
                 Burger",
  title =        "Architecting phase change memory as a scalable {DRAM}
                 alternative",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "2--13",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555754.1555758",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Memory scaling is in jeopardy as charge storage and
                 sensing mechanisms become less reliable for prevalent
                 memory technologies, such as DRAM. In contrast, phase
                 change memory (PCM) storage relies on scalable current
                 and thermal mechanisms. To exploit PCM's scalability as
                 a DRAM alternative, PCM must be architected to address
                 relatively long latencies, high energy writes, and
                 finite endurance.\par

                 We propose, crafted from a fundamental understanding of
                 PCM technology parameters, area-neutral architectural
                 enhancements that address these limitations and make
                 PCM competitive with DRAM. A baseline PCM system is
                 1.6x slower and requires 2.2x more energy than a DRAM
                 system. Buffer reorganizations reduce this delay and
                 energy gap to 1.2x and 1.0x, using narrow rows to
                 mitigate write energy and multiple rows to improve
                 locality and write coalescing. Partial writes enhance
                 memory endurance, providing 5.6 years of lifetime.
                 Process scaling will further reduce PCM energy costs
                 and improve endurance.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "DRAM alternative; endurance; energy; PCM; performance;
                 phase change memory; power; scalability",
}

@Article{Zhou:2009:DEE,
  author =       "Ping Zhou and Bo Zhao and Jun Yang and Youtao Zhang",
  title =        "A durable and energy efficient main memory using phase
                 change memory technology",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "14--23",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555754.1555759",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Using nonvolatile memories in memory hierarchy has
                 been investigated to reduce its energy consumption
                 because nonvolatile memories consume zero leakage power
                 in memory cells. One of the difficulties is, however,
                 that the endurance of most nonvolatile memory
                 technologies is much shorter than the conventional SRAM
                 and DRAM technology. This has limited its usage to only
                 the low levels of a memory hierarchy, e.g., disks, that
                 is far from the CPU.\par

                 In this paper, we study the use of a new type of
                 nonvolatile memories -- the Phase Change Memory (PCM)
                 as the main memory for a 3D stacked chip. The main
                 challenges we face are the limited PCM endurance,
                 longer access latencies, and higher dynamic power
                 compared to the conventional DRAM technology. We
                 propose techniques to extend the endurance of the PCM
                 to an average of 13 (for MLC PCM cell) to 22 (for SLC
                 PCM) years. We also study the design choices of
                 implementing PCM to achieve the best tradeoff between
                 energy and performance. Our design reduced the total
                 energy of an already low-power DRAM main memory of the
                 same capacity by 65\%, and energy-delay$^2$ product by
                 60\%. These results indicate that it is feasible to use
                 PCM technology in place of DRAM in the main memory for
                 better energy efficiency.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "endurance; low power; phase change memory",
}

@Article{Qureshi:2009:SHP,
  author =       "Moinuddin K. Qureshi and Vijayalakshmi Srinivasan and
                 Jude A. Rivers",
  title =        "Scalable high performance main memory system using
                 phase-change memory technology",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "24--33",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555754.1555760",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The memory subsystem accounts for a significant cost
                 and power budget of a computer system. Current
                 DRAM-based main memory systems are starting to hit the
                 power and cost limit. An alternative memory technology
                 that uses resistance contrast in phase-change materials
                 is being actively investigated in the circuits
                 community. {\em Phase Change Memory (PCM)\/} devices
                 offer more density relative to DRAM, and can help
                 increase main memory capacity of future systems while
                 remaining within the cost and power constraints.\par

                 In this paper, we analyze a PCM-based hybrid main
                 memory system using an architecture level model of PCM.
                 We explore the trade-offs for a main memory system
                 consisting of PCM storage coupled with a small DRAM
                 buffer. Such an architecture has the latency benefits
                 of DRAM and the capacity benefits of PCM. Our
                 evaluations for a baseline system of 16-cores with 8GB
                 DRAM show that, on average, PCM can reduce page faults
                 by 5X and provide a speedup of 3X. As PCM is projected
                 to have limited write endurance, we also propose simple
                 organizational and management solutions of the hybrid
                 memory that reduces the write traffic to PCM, boosting
                 its lifetime from 3 years to 9.7 years.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "DRAM caching; phase change memory; wear leveling",
}

%%% Wu et al.: hybrid cache architectures (LHCA, RHCA, 3DHCA) built
%%% from disparate memory technologies.  Computer Architecture News
%%% 37(3), June 2009, pages 34--45.  The journal and acknowledgement
%%% values are string macros defined elsewhere in this file.
@Article{Wu:2009:HCA,
  author =       "Xiaoxia Wu and Jian Li and Lixin Zhang and Evan
                 Speight and Ram Rajamony and Yuan Xie",
  title =        "Hybrid cache architecture with disparate memory
                 technologies",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "34--45",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555754.1555761",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Caching techniques have been an efficient mechanism
                 for mitigating the effects of the processor-memory
                 speed gap. Traditional multi-level SRAM-based cache
                 hierarchies, especially in the context of chip
                 multiprocessors (CMPs), present many challenges in area
                 requirements, core-to-cache balance, power consumption,
                 and design complexity. New advancements in technology
                 enable caches to be built from other technologies, such
                 as Embedded DRAM (EDRAM), Magnetic RAM (MRAM), and
                 Phase-change RAM (PRAM), in both 2D chips or 3D stacked
                 chips. Caches fabricated in these technologies offer
                 dramatically different power and performance
                 characteristics when compared with SRAM-based caches,
                 particularly in the areas of access latency, cell
                 density, and overall power consumption. In this paper,
                 we propose to take advantage of the best
                 characteristics that each technology offers, through
                 the use of Hybrid Cache Architecture (HCA) designs. We
                 discuss and evaluate two types of hybrid cache
                 architectures: inter cache Level HCA (LHCA), in which
                 the levels in a cache hierarchy can be made of
                 disparate memory technologies; and intra cache level or
                 cache Region based HCA (RHCA), where a single level of
                 cache can be partitioned into multiple regions, each of
                 a different memory technology. We have studied a number
                 of different HCA architectures and explored the
                 potential of hardware support for intra-cache data
                 movement and power consumption management within HCA
                 caches. Utilizing a full-system simulator that has been
                 validated against real hardware, we demonstrate that an
                 LHCA design can provide a geometric mean 7\% IPC
                 improvement over a baseline 3-level SRAM cache design
                 under the same area constraint across a collection of
                 25 workloads. A more aggressive RHCA-based design
                 provides 12\% IPC improvement over the baseline.
                 Finally, a 2-layer 3D cache stack (3DHCA) of high
                 density memory technology within the same chip
                 footprint gives 18\% IPC improvement over the baseline.
                 Furthermore, up to 70\% reduction in power consumption
                 over a baseline SRAM-only design is achieved.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "hybrid cache architecture; three-dimensional IC",
}

%%% Suh and Dubois: stabilizing the MIPS rate of out-of-order
%%% processors with a PID feedback controller and DVFS.  Computer
%%% Architecture News 37(3), June 2009, pages 46--56.  The journal and
%%% acknowledgement values are string macros defined elsewhere in this
%%% file.
@Article{Suh:2009:DMR,
  author =       "Jinho Suh and Michel Dubois",
  title =        "Dynamic {MIPS} rate stabilization in out-of-order
                 processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "46--56",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555815.1555763",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Today's microprocessor cores reach high performance
                 levels not only by their high clock rate but also by
                 the concurrent execution of a large number of
                 instructions. Because of the relationship between power
                 and frequency, it becomes attractive to run an OoO
                 (Out-of-Order) core at a frequency lower than its
                 nominal frequency in the context of embedded or
                 real-time systems. Unfortunately, whereas OoO pipelines
                 have high average throughput, their highly variable and
                 hard-to-predict execution rate makes them unsuitable
                 for real-time systems with hard or even soft deadlines.
                 In this paper, we demonstrate that the execution time
                 of an OoO processor can be stable and predictable by
                 controlling its MIPS (Mega Instructions Per Second)
                 rate via a PID (Proportional, Integral, and
                 Differential gain) feedback controller and DVFS
                 (Dynamic Voltage and Frequency Scaling). The stabilized
                 processor uses much less power per committed
                 instruction, because of the reduced average frequency.
                 The EPI (Energy Per Instruction) is also cut by an
                 average of 28\% across our benchmark programs. Since a
                 stable MIPS rate is maintained consistently with lower
                 power/energy per instruction, OoO processors stabilized
                 by a feedback controller can realistically be deployed
                 in real-time systems. To demonstrate this capability we
                 select a subset of the MiBench benchmarks that displays
                 the widest execution rate variations and stabilize
                 their MIPS rate in the context of a 1GHz Pentium
                 III-like microarchitecture.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "embedded systems; OoO processors; real-time systems;
                 stabilization; variability",
}

%%% Paolieri et al.: multicore architecture giving time analyzability
%%% (WCET analysis) to hard real-time tasks that share hardware
%%% resources.  Computer Architecture News 37(3), June 2009, pages
%%% 57--68.  The journal and acknowledgement values are string macros
%%% defined elsewhere in this file.
@Article{Paolieri:2009:HSW,
  author =       "Marco Paolieri and Eduardo Qui{\~n}ones and Francisco
                 J. Cazorla and Guillem Bernat and Mateo Valero",
  title =        "Hardware support for {WCET} analysis of hard real-time
                 multicore systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "57--68",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555815.1555764",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The increasing demand for new functionalities in
                 current and future hard real-time embedded systems like
                 automotive, avionics and space industries is driving an
                 increase in the performance required in embedded
                 processors. Multicore processors represent a good
                 design solution for such systems due to their high
                 performance, low cost and power consumption
                 characteristics. However, hard real-time embedded
                 systems require time analyzability and current
                 multicore processors are less analyzable than
                 single-core processors due to the interferences between
                 different tasks when accessing shared hardware
                 resources. In this paper we propose a multicore
                 architecture with shared resources that allows the
                 execution of applications with hard real-time and non
                 hard real-time constraints at the same time, providing
                 time analyzability for the hard real-time tasks so that
                 they can meet their deadlines. Moreover our
                 architecture proposal provides high-performance for the
                 non hard real-time tasks.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "analyzability; cache partitioning; hard real-time;
                 interconnection network; multicore; real-time embedded
                 systems; WCET",
}

%%% Somogyi et al.: Spatio-Temporal Memory Streaming (STeMS), a
%%% prefetching scheme combining spatial and temporal streaming.
%%% Computer Architecture News 37(3), June 2009, pages 69--80.  The
%%% journal and acknowledgement values are string macros defined
%%% elsewhere in this file.
@Article{Somogyi:2009:STM,
  author =       "Stephen Somogyi and Thomas F. Wenisch and Anastasia
                 Ailamaki and Babak Falsafi",
  title =        "Spatio-temporal memory streaming",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "69--80",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555815.1555766",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Recent research advocates memory streaming techniques
                 to alleviate the performance bottleneck caused by the
                 high latencies of off-chip memory accesses. Temporal
                 memory streaming replays previously observed miss
                 sequences to eliminate long chains of dependent misses.
                 Spatial memory streaming predicts repetitive data
                 layout patterns within fixed-size memory regions.
                 Because each technique targets a different subset of
                 misses, their effectiveness varies across workloads and
                 each leaves a significant fraction of misses
                 unpredicted.\par

                 In this paper, we propose Spatio-Temporal Memory
                 Streaming (STeMS) to exploit the synergy between
                 spatial and temporal streaming. We observe that the
                 order of spatial accesses repeats both within and
                 across regions. STeMS records and replays the temporal
                 sequence of region accesses and uses spatial
                 relationships within each region to dynamically
                 reconstruct a predicted total miss order. Using
                 trace-driven and cycle-accurate simulation across a
                 suite of commercial workloads, we demonstrate that with
                 similar implementation complexity as temporal
                 streaming, STeMS achieves equal or higher coverage than
                 spatial or temporal memory streaming alone, and
                 improves performance by 31\%, 3\%, and 18\% over
                 stride, spatial, and temporal prediction,
                 respectively.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "prefetching; spatial correlation; temporal
                 correlation",
}

%%% Diaz and Cintra: stream chaining, a data prefetcher that links
%%% localized miss streams into chains.  Computer Architecture News
%%% 37(3), June 2009, pages 81--92.  The journal and acknowledgement
%%% values are string macros defined elsewhere in this file.
@Article{Diaz:2009:SCE,
  author =       "Pedro Diaz and Marcelo Cintra",
  title =        "Stream chaining: exploiting multiple levels of
                 correlation in data prefetching",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "81--92",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555815.1555767",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Data prefetching has long been an important technique
                 to amortize the effects of the memory wall, and is
                 likely to remain so in the current era of multi-core
                 systems. Most prefetchers operate by identifying
                 patterns and correlations in the miss address stream.
                 Separating streams according to the memory access
                 instruction that generates the misses is an effective
                 way of filtering out spurious addresses from
                 predictable streams. On the other hand, by localizing
                 streams based on the memory access instructions, such
                 prefetchers both lose the complete time sequence
                 information of misses and can only issue prefetches for
                 a single memory access instruction at a time.\par

                 This paper proposes a novel class of prefetchers based
                 on the idea of linking various localized streams into
                 predictable chains of missing memory access
                 instructions such that the prefetcher can issue
                 prefetches along multiple streams. In this way the
                 prefetcher is not limited to prefetching deeply for a
                 single missing memory access instruction but can
                 instead adaptively prefetch for other memory access
                 instructions closer in time.\par

                 Experimental results show that the proposed prefetcher
                 consistently achieves better performance than a
                 state-of-the-art prefetcher -- 10\% on average, being
                 only outperformed in very few cases and then by only
                 2\%, and outperforming that prefetcher by as much as
                 55\% -- while consuming the same amount of memory
                 bandwidth.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "data prefetching",
}

%%% Powell et al.: architectural core salvaging, which migrates
%%% threads between cores to tolerate hard errors in a multi-core
%%% processor.  Computer Architecture News 37(3), June 2009, pages
%%% 93--104.  The journal and acknowledgement values are string macros
%%% defined elsewhere in this file.
@Article{Powell:2009:ACS,
  author =       "Michael D. Powell and Arijit Biswas and Shantanu Gupta
                 and Shubhendu S. Mukherjee",
  title =        "Architectural core salvaging in a multi-core processor
                 for hard-error tolerance",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "93--104",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555754.1555769",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The incidence of hard errors in CPUs is a challenge
                 for future multicore designs due to increasing total
                 core area. Even if the location and nature of hard
                 errors are known a priori, either at manufacture-time
                 or in the field, cores with such errors must be
                 disabled in the absence of hard-error tolerance. While
                 caches, with their regular and repetitive structures,
                 are easily covered against hard errors by providing
                 spare arrays or spare lines, structures within a core
                 are neither as regular nor as repetitive. Previous work
                 has proposed microarchitectural core salvaging to
                 exploit structural redundancy within a core and
                 maintain functionality in the presence of hard errors.
                 Unfortunately microarchitectural salvaging introduces
                 complexity and may provide only limited coverage of
                 core area against hard errors due to a lack of natural
                 redundancy in the core.\par

                 This paper makes a case for architectural core
                 salvaging. We observe that even if some individual
                 cores cannot execute certain operations, a CPU die can
                 be instruction-set-architecture (ISA) compliant, that
                 is execute all of the instructions required by its ISA,
                 by exploiting natural cross-core redundancy. We propose
                 using hardware to migrate offending threads to another
                 core that can execute the operation. Architectural core
                 salvaging can cover a large core area against faults,
                 and be implemented by leveraging known techniques that
                 minimize changes to the microarchitecture. We show it
                 is possible to optimize architectural core salvaging
                 such that the performance on a faulty die approaches
                 that of a fault-free die---assuring significantly
                 better performance than core disabling for many
                 workloads and no worse performance than core disabling
                 for the remainder.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "core salvaging; hard errors; redundancy; reliability",
}

%%% Carretero et al.: a microarchitectural mechanism that checks that
%%% data consumed through registers are correct (continuous
%%% self-test).  Computer Architecture News 37(3), June 2009, pages
%%% 105--115.  The journal and acknowledgement values are string
%%% macros defined elsewhere in this file.
@Article{Carretero:2009:EER,
  author =       "Javier Carretero and Pedro Chaparro and Xavier Vera
                 and Jaume Abella and Antonio Gonz{\'a}lez",
  title =        "End-to-end register data-flow continuous self-test",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "105--115",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555754.1555770",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "While Moore's Law predicts the ability of
                 semi-conductor industry to engineer smaller and more
                 efficient transistors and circuits, there are serious
                 issues not contemplated in that law. One concern is the
                 verification effort of modern computing systems, which
                 has grown to dominate the cost of system design. On the
                 other hand, technology scaling leads to burn-in phase
                 out. As a result, in-the-field error rate may increase
                 due to both actual errors and latent defects. Whereas
                 data can be protected with arithmetic codes (like
                 parity or ECC), there is a lack of cost-effective
                 mechanisms for control logic.\par

                 This paper presents a light-weight microarchitectural
                 mechanism that ensures that data consumed through
                 registers are correct. Microarchitecture presents a new
                 way to manage reliability and testing without
                 significantly sacrificing cost and performance,
                 offering a unique opportunity to detect errors in the
                 field at low cost. Our results show a coverage around
                 90\% for the targeted structures with a cost in power
                 and area of about 4\%. The structures protected include
                 the issue queue logic and the data associated (i.e.,
                 tags, control signals), input multiplexors, rename
                 data, replay logic, register free list, bypasses data
                 and logic, MOB data and addresses, register file logic,
                 register file storage and functional units.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "control logic; degradation; design errors; end-to-end
                 protection; online testing",
}

%%% Yoon and Erez: Memory Mapped ECC, which keeps only error-detection
%%% bits in SRAM and stores the ECC bits in the memory hierarchy as
%%% data.  Computer Architecture News 37(3), June 2009, pages
%%% 116--127.  The journal and acknowledgement values are string
%%% macros defined elsewhere in this file.
@Article{Yoon:2009:MME,
  author =       "Doe Hyun Yoon and Mattan Erez",
  title =        "Memory mapped {ECC}: low-cost error protection for
                 last level caches",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "116--127",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555754.1555771",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper presents a novel technique, Memory Mapped
                 ECC, which reduces the cost of providing error
                 correction for SRAM caches. It is important to limit
                 such overheads as processor resources become
                 constrained and error propensity increases. The
                 continuing decrease in SRAM cell size and the growing
                 capacity of caches increases the likelihood of errors
                 in SRAM arrays. To address this, redundant information
                 can be used to correct a value after an error occurs.
                 Information redundancy is typically provided through
                 error-correcting codes (ECC), which append bits to
                 every SRAM row and increase the array's area and energy
                 consumption. We make three observations regarding error
                 protection and utilize them in our architecture: (1)
                 much of the data in a cache is replicated throughout
                 the hierarchy and is inherently redundant; (2)
                 error-detection is necessary for every cache access and
                 is cheaper than error correction, which is very
                 infrequent; (3) redundant information for correction
                 need not be stored in high-cost SRAM. Our unique
                 architecture only dedicates SRAM for error detection
                 while the ECC bits are stored within the memory
                 hierarchy as data. We associate a physical memory
                 address with each cache line for ECC storage and rely
                 on locality to minimize the impact. The cache is
                 dynamically and transparently partitioned between data
                 and ECC with the fraction of ECC growing with the
                 number of dirty cache lines. We show that this has
                 little impact on both performance (1.3\% average and
                 $<$ 4\%) and memory traffic (3\%) across a range of
                 memory-intensive applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "error correction; last-level caches; reliability; soft
                 error",
}

%%% Woh et al.: AnySP, a programmable SIMD architecture for mobile
%%% signal processing.  Computer Architecture News 37(3), June 2009,
%%% pages 128--139.  The journal and acknowledgement values are string
%%% macros defined elsewhere in this file.
@Article{Woh:2009:AAA,
  author =       "Mark Woh and Sangwon Seo and Scott Mahlke and Trevor
                 Mudge and Chaitali Chakrabarti and Krisztian Flautner",
  title =        "{AnySP}: anytime anywhere anyway signal processing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "128--139",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555754.1555773",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In the past decade, the proliferation of mobile
                 devices has increased at a spectacular rate. There are
                 now more than 3.3 billion active cell phones in the
                 world---a device that we now all depend on in our daily
                 lives. The current generation of devices employs a
                 combination of general-purpose processors, digital
                 signal processors, and hardwired accelerators to
                 provide giga-operations-per-second performance on
                 milliwatt power budgets. Such heterogeneous
                 organizations are inefficient to build and maintain, as
                 well as waste silicon area and power. Looking forward
                 to the next generation of mobile computing, computation
                 requirements will increase by one to three orders of
                 magnitude due to higher data rates, increased
                 complexity algorithms, and greater computation
                 diversity but the power requirements will be just as
                 stringent. Scaling of existing approaches will not
                 suffice instead the inherent computational efficiency,
                 programmability, and adaptability of the hardware must
                 change. To overcome these challenges, this paper
                 proposes an example architecture, referred to as AnySP,
                 for the next generation mobile signal processing. AnySP
                 uses a co-design approach where the next generation
                 wireless signal processing and high-definition video
                 algorithms are analyzed to create a domain specific
                 programmable architecture. At the heart of AnySP is a
                 configurable single-instruction multiple-data datapath
                 that is capable of processing wide vectors or multiple
                 narrow vectors simultaneously. In addition, deeper
                 computation subgraphs can be pipelined across the
                 single-instruction multiple-data lanes. These three
                 operating modes provide high throughput across varying
                 application types. Results show that AnySP is capable
                 of sustaining 4G wireless processing and
                 high-definition video throughput rates, and will
                 approach the 1000 Mops/mW efficiency barrier when
                 scaled to 45nm.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "fully programmable architecture; high-end signal
                 processing; low-power architecture; SIMD;
                 single-instruction multiple-data parallelism; software
                 defined radio",
}

%%% Kelm et al.: Rigel, a 1000-core programmable accelerator
%%% architecture and its low-level programming interface.  Computer
%%% Architecture News 37(3), June 2009, pages 140--151.  The journal
%%% and acknowledgement values are string macros defined elsewhere in
%%% this file.
@Article{Kelm:2009:RAS,
  author =       "John H. Kelm and Daniel R. Johnson and Matthew R.
                 Johnson and Neal C. Crago and William Tuohy and Aqeel
                 Mahesri and Steven S. Lumetta and Matthew I. Frank and
                 Sanjay J. Patel",
  title =        "{Rigel}: an architecture and scalable programming
                 interface for a 1000-core accelerator",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "140--151",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555754.1555774",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper considers Rigel, a programmable accelerator
                 architecture for a broad class of data- and
                 task-parallel computation. Rigel comprises 1000+
                 hierarchically-organized cores that use a fine-grained,
                 dynamically scheduled single-program, multiple-data
                 (SPMD) execution model. Rigel's low-level programming
                 interface adopts a single global address space model
                 where parallel work is expressed in a task-centric,
                 bulk-synchronized manner using minimal hardware
                 support. Compared to existing accelerators, which
                 contain domain-specific hardware, specialized memories,
                 and/or restrictive programming models, Rigel is more
                 flexible and provides a straightforward target for a
                 broader set of applications.\par

                 We perform a design analysis of Rigel to quantify the
                 compute density and power efficiency of our initial
                 design. We find that Rigel can achieve a density of
                 over 8 single-precision GFLOPS/mm$^2$ in 45nm, which is
                 comparable to high-end GPUs scaled to 45nm. We perform
                 experimental analysis on several applications ported to
                 the Rigel low-level programming interface. We examine
                 scalability issues related to work distribution,
                 synchronization, and load-balancing for 1000-core
                 accelerators using software techniques and minimal
                 specialized hardware support. We find that while it is
                 important to support fast task distribution and barrier
                 operations, these operations can be implemented without
                 specialized hardware using flexible hardware
                 primitives.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "accelerator; computer architecture; low-level
                 programming interface",
}

%%% Hong and Kim: analytical model that estimates the execution time
%%% of massively parallel programs on GPU architectures from memory
%%% warp parallelism.  Computer Architecture News 37(3), June 2009,
%%% pages 152--163.  The journal and acknowledgement values are string
%%% macros defined elsewhere in this file.
@Article{Hong:2009:AMG,
  author =       "Sunpyo Hong and Hyesoon Kim",
  title =        "An analytical model for a {GPU} architecture with
                 memory-level and thread-level parallelism awareness",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "152--163",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555815.1555775",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "GPU architectures are increasingly important in the
                 multi-core era due to their high number of parallel
                 processors. Programming thousands of massively parallel
                 threads is a big challenge for software engineers, but
                 understanding the performance bottlenecks of those
                 parallel programs on GPU architectures to improve
                 application performance is even more difficult. Current
                 approaches rely on programmers to tune their
                 applications by exploiting the design space
                 exhaustively without fully understanding the
                 performance characteristics of their
                 applications.\par

                 To provide insights into the performance bottlenecks of
                 parallel applications on GPU architectures, we propose
                 a simple analytical model that estimates the execution
                 time of massively parallel programs. The key component
                 of our model is estimating the number of parallel
                 memory requests (we call this the memory warp
                 parallelism) by considering the number of running
                 threads and memory bandwidth. Based on the degree of
                 memory warp parallelism, the model estimates the cost
                 of memory requests, thereby estimating the overall
                 execution time of a program. Comparisons between the
                 outcome of the model and the actual execution time in
                 several GPUs show that the geometric mean of absolute
                 error of our model on micro-benchmarks is 5.4\% and on
                 GPU computing applications is 13.3\%. All the
                 applications are written in the CUDA programming
                 language.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "analytical model; CUDA; GPU architecture; memory level
                 parallelism; performance estimation; warp level
                 parallelism",
}

%%% Computer Architecture News 37(3), June 2009, pp. 164--173:
%%% Mergeable cache for data-similar executions on multicore processors.
@Article{Biswas:2009:MEM,
  author =       "Susmit Biswas and Diana Franklin and Alan Savage and
                 Ryan Dixon and Timothy Sherwood and Frederic T. Chong",
  title =        "Multi-execution: multicore caching for data-similar
                 executions",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "164--173",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555815.1555777",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "While microprocessor designers turn to multicore
                 architectures to sustain performance expectations, the
                 dramatic increase in parallelism of such architectures
                 will put substantial demands on off-chip bandwidth and
                 make the memory wall more significant than ever. This
                 paper demonstrates that one profitable application of
                 multicore processors is the execution of many similar
                 instantiations of the same program. We identify that
                 this model of execution is used in several practical
                 scenarios and term it as `multi-execution.' Often, each
                 such instance utilizes very similar data. In
                 conventional cache hierarchies, each instance would
                 cache its own data independently. We propose the
                 Mergeable cache architecture that detects data
                 similarities and merges cache blocks, resulting in
                 substantial savings in cache storage requirements. This
                 leads to reductions in off-chip memory accesses and
                 overall power usage, and increases in application
                 performance. We present cycle-accurate simulation
                 results of 8 benchmarks (6 from SPEC2000) to
                 demonstrate that our technique provides a scalable
                 solution and leads to significant speedups due to
                 reductions in main memory accesses. For 8 cores running
                 8 similar executions of the same application and
                 sharing an exclusive 4-MB, 8-way L2 cache, the
                 Mergeable cache shows a speedup in execution by 2.5x on
                 average (ranging from 0.93x to 6.92x), while posing an
                 overhead of only 4.28\% on cache area and 5.21\% on
                 power when it is used.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "CMP; data similar execution; multicore cache design",
}

@article{Xie:2009:PPI,
  author          = {Yuejian Xie and Gabriel H. Loh},
  title           = {{PIPP}: promotion\slash insertion pseudo-partitioning
                     of multi-core shared caches},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {37},
  number          = {3},
  pages           = {174--183},
  month           = jun,
  year            = {2009},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  doi             = {https://doi.org/10.1145/1555815.1555778},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Tue Aug 11 18:12:55 MDT 2009},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  keywords        = {cache; contention; insertion; multi-core; promotion;
                     sharing},
  abstract        = {Many multi-core processors employ a large last-level
                     cache (LLC) shared among the multiple cores. Past
                     research has demonstrated that sharing-oblivious cache
                     management policies (e.g., LRU) can lead to poor
                     performance and fairness when the multiple cores
                     compete for the limited LLC capacity. Different memory
                     access patterns can cause cache contention in different
                     ways, and various techniques have been proposed to
                     target some of these behaviors. In this work, we
                     propose a new cache management approach that combines
                     dynamic insertion and promotion policies to provide the
                     benefits of cache partitioning, adaptive insertion, and
                     capacity stealing all with a single mechanism. By
                     handling multiple types of memory behaviors, our
                     proposed technique outperforms techniques that target
                     only either capacity partitioning or adaptive
                     insertion.},
}

@article{Hardavellas:2009:RNN,
  author          = {Nikos Hardavellas and Michael Ferdman and Babak
                     Falsafi and Anastasia Ailamaki},
  title           = {{Reactive NUCA}: near-optimal block placement and
                     replication in distributed caches},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {37},
  number          = {3},
  pages           = {184--195},
  month           = jun,
  year            = {2009},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  doi             = {https://doi.org/10.1145/1555754.1555779},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Tue Aug 11 18:12:55 MDT 2009},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  keywords        = {block migration; block placement; block replication;
                     cache; cache coherence; cache indexing; cache lookup;
                     cache management; chip multiprocessor; cmp; coherence;
                     data migration; data placement; data replication;
                     interleaving; last-level cache; lookup; migration;
                     multi-core; multicore; non-uniform cache access; NUCA;
                     placement; private cache; R-NUCA; Reactive NUCA;
                     replication; rotational interleaving; shared cache},
  abstract        = {Increases in on-chip communication delay and the large
                     working sets of server and scientific workloads
                     complicate the design of the on-chip last-level cache
                     for multicore processors. The large working sets favor
                     a shared cache design that maximizes the aggregate
                     cache capacity and minimizes off-chip memory requests.
                     At the same time, the growing on-chip communication
                     delay favors core-private caches that replicate data to
                     minimize delays on global wires. Recent hybrid
                     proposals offer lower average latency than conventional
                     designs, but they address the placement requirements of
                     only a subset of the data accessed by the application,
                     require complex lookup and coherence mechanisms that
                     increase latency, or fail to scale to high core
                     counts.\par

                     In this work, we observe that the cache access patterns
                     of a range of server and scientific workloads can be
                     classified into distinct classes, where each class is
                     amenable to different block placement policies. Based
                     on this observation, we propose Reactive NUCA (R-NUCA),
                     a distributed cache design which reacts to the class of
                     each cache access and places blocks at the appropriate
                     location in the cache. R-NUCA cooperates with the
                     operating system to support intelligent placement,
                     migration, and replication without the overhead of an
                     explicit coherence mechanism for the on-chip last-level
                     cache. In a range of server, scientific, and
                     multiprogrammed workloads, R-NUCA matches the
                     performance of the best cache design for each workload,
                     improving performance by 14\% on average over competing
                     designs and by 32\% at best, while achieving
                     performance within 5\% of an ideal cache design.},
}

@article{Moscibroda:2009:CBR,
  author          = {Thomas Moscibroda and Onur Mutlu},
  title           = {A case for bufferless routing in on-chip networks},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {37},
  number          = {3},
  pages           = {196--207},
  month           = jun,
  year            = {2009},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  doi             = {https://doi.org/10.1145/1555815.1555781},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Tue Aug 11 18:12:55 MDT 2009},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  keywords        = {memory systems; multi-core; on-chip networks;
                     routing},
  abstract        = {Buffers in on-chip networks consume significant
                     energy, occupy chip area, and increase design
                     complexity. In this paper, we make a case for a new
                     approach to designing on-chip interconnection networks
                     that eliminates the need for buffers for routing or
                     flow control. We describe new algorithms for routing
                     without using buffers in router input/output ports. We
                     analyze the advantages and disadvantages of bufferless
                     routing and discuss how router latency can be reduced
                     by taking advantage of the fact that input/output
                     buffers do not exist. Our evaluations show that routing
                     without buffers significantly reduces the energy
                     consumption of the on-chip cache/processor-to-cache
                     network, while providing similar performance to that of
                     existing buffered routing algorithms at low network
                     utilization (i.e., on most real applications). We
                     conclude that bufferless routing can be an attractive
                     and energy-efficient design option for on-chip
                     cache/processor-to-cache networks where network
                     utilization is low.},
}

@article{Kinsy:2009:AAD,
  author          = {Michel A. Kinsy and Myong Hyon Cho and Tina Wen and
                     Edward Suh and Marten van Dijk and Srinivas Devadas},
  title           = {Application-aware deadlock-free oblivious routing},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {37},
  number          = {3},
  pages           = {208--219},
  month           = jun,
  year            = {2009},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  doi             = {https://doi.org/10.1145/1555815.1555782},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Tue Aug 11 18:12:55 MDT 2009},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  keywords        = {oblivious routing; on-chip interconnection networks;
                     systems-on-chip},
  abstract        = {Conventional oblivious routing algorithms are either
                     not application-aware or assume that each flow has its
                     own private channel to ensure deadlock avoidance. We
                     present a framework for application-aware routing that
                     assures deadlock-freedom under one or more channels by
                     forcing routes to conform to an acyclic channel
                     dependence graph. Arbitrary minimal routes can be made
                     deadlock-free through appropriate static channel
                     allocation when two or more channels are available.
                     Given bandwidth estimates for flows, we present a mixed
                     integer-linear programming (MILP) approach and a
                     heuristic approach for producing deadlock-free routes
                     that minimize maximum channel load. The heuristic
                     algorithm is calibrated using the MILP algorithm and
                     evaluated on a number of benchmarks through detailed
                     network simulation. Our framework can be used to
                     produce application-aware routes that target the
                     minimization of latency, number of flows through a
                     link, bandwidth, or any combination thereof.},
}

%%% Computer Architecture News 37(3), June 2009, pp. 220--231:
%%% indirect adaptive routing (CRT, PAR, PB, RES) on the dragonfly topology.
%%% NOTE(review): the abstract previously stopped mid-sentence after
%%% "requiring"; the closing clause below was restored from the publisher's
%%% abstract as best recalled -- confirm against the ACM Digital Library
%%% record for this DOI.
@Article{Jiang:2009:IAR,
  author =       "Nan Jiang and John Kim and William J. Dally",
  title =        "Indirect adaptive routing on large scale
                 interconnection networks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "220--231",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555754.1555783",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Recently proposed high-radix interconnection networks
                 [10] require global adaptive routing to achieve optimum
                 performance. Existing direct adaptive routing methods
                 are slow to sense congestion remote from the source
                 router and hence misroute many packets before such
                 congestion is detected. This paper introduces indirect
                 global adaptive routing (IAR) in which the adaptive
                 routing decision uses information that is not directly
                 available at the source router. We describe four IAR
                 routing methods: credit round trip (CRT) [10],
                 progressive adaptive routing (PAR), piggyback routing
                 (PB), and reservation routing (RES). We evaluate each
                 of these methods on the dragonfly topology under both
                 steady-state and transient loads. Our results show that
                 PB, PAR, and CRT all achieve good performance. PB
                 provides the best absolute performance, with 2--7\%
                 lower latency on steady-state uniform random traffic at
                 70\% load, while PAR provides the fastest response on
                 transient loads. We also evaluate the implementation
                 costs of the indirect adaptive routing methods and show
                 that PB has the lowest implementation cost requiring
                 $<$ 1\% increase in the total storage of a typical
                 high-radix router.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "dragonfly; interconnection networks; routing",
}

@article{Hamilton:2009:ISS,
  author          = {James Hamilton},
  title           = {{Internet}-scale service infrastructure efficiency},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {37},
  number          = {3},
  pages           = {232--232},
  month           = jun,
  year            = {2009},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  doi             = {https://doi.org/10.1145/1555815.1555756},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Tue Aug 11 18:12:55 MDT 2009},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  keywords        = {efficiency; Internet-scale},
  abstract        = {High-scale cloud services provide economies of scale
                     of five to ten over small-scale deployments, and are
                     becoming a large part of both enterprise information
                     processing and consumer services. Even very large
                     enterprise IT deployments have quite different cost
                     drivers and optimizations points from internet-scale
                     services. The former are people-dominated from a cost
                     perspective whereas internet-scale service costs are
                     driven by server hardware and infrastructure with
                     people costs fading into the noise at less than
                     10\%.\par

                     In this talk we inventory where the infrastructure
                     costs are in internet-scale services. We track power
                     distribution from 115KV at the property line through
                     all conversions into the data center tracking the
                     losses to final delivery at semiconductor voltage
                     levels. We track cooling and all the energy conversions
                     from power dissipation through release to the
                     environment outside of the building. Understanding
                     where the costs and inefficiencies lie, we'll look more
                     closely at cooling and overall mechanical system
                     design, server hardware design, and software techniques
                     including graceful degradation mode, power yield
                     management, and resource consumption shaping.},
}

@article{Blundell:2009:IPT,
  author          = {Colin Blundell and Milo M. K. Martin and Thomas F.
                     Wenisch},
  title           = {{InvisiFence}: performance-transparent memory ordering
                     in conventional multiprocessors},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {37},
  number          = {3},
  pages           = {233--244},
  month           = jun,
  year            = {2009},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  doi             = {https://doi.org/10.1145/1555754.1555785},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Tue Aug 11 18:12:55 MDT 2009},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  keywords        = {memory consistency; parallel programming},
  abstract        = {A multiprocessor's memory consistency model imposes
                     ordering constraints among loads, stores, atomic
                     operations, and memory fences. Even for consistency
                     models that relax ordering among loads and stores,
                     ordering constraints still induce significant
                     performance penalties due to atomic operations and
                     memory ordering fences. Several prior proposals reduce
                     the performance penalty of strongly ordered models
                     using post-retirement speculation, but these designs
                     either (1) maintain speculative state at a per-store
                     granularity, causing storage requirements to grow
                     proportionally to speculation depth, or (2) employ
                     distributed global commit arbitration using
                     unconventional chunk-based invalidation mechanisms. In
                     this paper we propose InvisiFence, an approach for
                     implementing memory ordering based on post-retirement
                     speculation that avoids these concerns. InvisiFence
                     leverages minimalistic mechanisms for post-retirement
                     speculation proposed in other contexts to (1) track
                     speculative state efficiently at block-granularity with
                     dedicated storage requirements independent of
                     speculation depth, (2) provide fast commit by avoiding
                     explicit commit arbitration, and (3) operate under a
                     conventional invalidation-based cache coherence
                     protocol. InvisiFence supports both modes of operation
                     found in prior work: speculating only when necessary to
                     minimize the risk of rollback-inducing violations or
                     speculating continuously to decouple consistency
                     enforcement from the processor core. Overall,
                     InvisiFence requires approximately one kilobyte of
                     additional state to transform a conventional
                     multiprocessor into one that provides
                     performance-transparent memory ordering, fences, and
                     atomic operations.},
}

@article{Hilton:2009:DSC,
  author          = {Andrew Hilton and Amir Roth},
  title           = {Decoupled store completion\slash silent deterministic
                     replay: enabling scalable data memory for {CPR\slash
                     CFP} processors},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {37},
  number          = {3},
  pages           = {245--254},
  month           = jun,
  year            = {2009},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  doi             = {https://doi.org/10.1145/1555815.1555786},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Tue Aug 11 18:12:55 MDT 2009},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  keywords        = {checkpoint processors; load-store queues},
  abstract        = {CPR/CFP (Checkpoint Processing and Recovery/Continual
                     Flow Pipeline) support an adaptive instruction window
                     that scales to tolerate last-level cache misses.
                     CPR/CFP scale the register file by aggressively
                     reclaiming the destination registers of many in-flight
                     instructions. However, an analogous mechanism does not
                     exist for stores and loads. As the window expands,
                     CPR/CFP processors must track all in-flight stores and
                     loads to support forwarding and detect memory ordering
                     violations.\par

                     The previously-described SVW (Store Vulnerability
                     Window) and SQIP (Store Queue Index Prediction) schemes
                     provide scalable, non-associative load and store
                     queues, respectively. However, they don't work smoothly
                     in a CPR/CFP context. SVW/SQIP rely on the ability to
                     dynamically stall some loads until a specific older
                     store writes to the cache. Enforcing this serialization
                     in CPR/CFP is expensive if the load and store are in
                     the same checkpoint.\par

                     We introduce two complementary procedures that
                     implement this serialization efficiently. Decoupled
                     Store Completion (DSC) allows stores to write to the
                     cache before the enclosing checkpoint completes
                     execution. Silent Deterministic Replay (SDR) supports
                     mis-speculation recovery in the presence of DSC by
                     replaying loads older than completed stores using
                     values from the load queue. The combination of DSC and
                     SDR enables an SVW/SQIP based CPR/CFP memory system
                     that outperforms previous designs while occupying less
                     area.},
}

@article{Zheng:2009:DDB,
  author          = {Hongzhong Zheng and Jiang Lin and Zhao Zhang and
                     Zhichun Zhu},
  title           = {Decoupled {DIMM}: building high-bandwidth memory
                     system using low-speed {DRAM} devices},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  volume          = {37},
  number          = {3},
  pages           = {255--266},
  month           = jun,
  year            = {2009},
  coden           = {CANED2},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  doi             = {https://doi.org/10.1145/1555754.1555788},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Tue Aug 11 18:12:55 MDT 2009},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  keywords        = {bandwidth decoupling; decoupled DIMM; DRAM memories},
  abstract        = {The widespread use of multicore processors has
                     dramatically increased the demands on high bandwidth
                     and large capacity from memory systems. In a
                     conventional DDR2/DDR3 DRAM memory system, the memory
                     bus and DRAM devices run at the same data rate. To
                     improve memory bandwidth, we propose a new memory
                     system design called decoupled DIMM that allows the
                     memory bus to operate at a data rate much higher than
                     that of the DRAM devices. In the design, a
                     synchronization buffer is added to relay data between
                     the slow DRAM devices and the fast memory bus; and
                     memory access scheduling is revised to avoid access
                     conflicts on memory ranks. The design not only improves
                     memory bandwidth beyond what can be supported by
                     current memory devices, but also improves reliability,
                     power efficiency, and cost effectiveness by using
                     relatively slow memory devices. The idea of decoupling,
                     precisely the decoupling of bandwidth match between
                     memory bus and a single rank of devices, can also be
                     applied to other types of memory systems including
                     FB-DIMM.\par

                     Our experimental results show that a decoupled DIMM
                     system of 2667MT/s bus data rate and 1333MT/s device
                     data rate improves the performance of memory-intensive
                     workloads by 51\% on average over a conventional memory
                     system of 1333MT/s data rate. Alternatively, a
                     decoupled DIMM system of 1600MT/s bus data rate and
                     800MT/s device data rate incurs only 8\% performance
                     loss when compared with a conventional system of
                     1600MT/s data rate, with 16\% reduction on the memory
                     power consumption and 9\% saving on memory energy.},
}

%%% Computer Architecture News 37(3), June 2009, pp. 267--278:
%%% memory blades for disaggregated memory in blade servers.
@Article{Lim:2009:DME,
  author =       "Kevin Lim and Jichuan Chang and Trevor Mudge and
                 Parthasarathy Ranganathan and Steven K. Reinhardt and
                 Thomas F. Wenisch",
  title =        "Disaggregated memory for expansion and sharing in
                 blade servers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "267--278",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555815.1555789",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Analysis of technology and application trends reveals
                 a growing imbalance in the peak
                 compute-to-memory-capacity ratio for future servers. At
                 the same time, the fraction contributed by memory
                 systems to total datacenter costs and power consumption
                 during typical usage is increasing. In response to
                 these trends, this paper re-examines traditional
                 compute-memory co-location on a single system and
                 details the design of a new general-purpose
                 architectural building block---a memory blade---that
                 allows memory to be `disaggregated' across a system
                 ensemble. This remote memory blade can be used for
                 memory capacity expansion to improve performance and
                 for sharing memory across servers to reduce
                 provisioning and power costs. We use this memory blade
                 building block to propose two new system architecture
                 solutions---(1) page-swapped remote memory at the
                 virtualization layer, and (2) block-access remote
                 memory with support in the coherence hardware---that
                 enable transparent memory expansion and sharing on
                 commodity-based systems. Using simulations of a mix of
                 enterprise benchmarks supplemented with traces from
                 live datacenters, we demonstrate that memory
                 disaggregation can provide substantial performance
                 benefits (on average 10X) in memory constrained
                 environments, while the sharing enabled by our
                 solutions can improve performance-per-dollar by up to
                 57\% when optimizing memory provisioning across
                 multiple servers.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "disaggregated memory; memory blades; memory capacity
                 expansion; power and cost efficiencies",
}

%%% Computer Architecture News 37(3), June 2009, pp. 279--289:
%%% performance study of NAND Flash solid-state disks under PC workloads.
@Article{Dirik:2009:PPS,
  author =       "Cagdas Dirik and Bruce Jacob",
  title =        "The performance of {PC} solid-state disks {(SSDs)} as
                 a function of bandwidth, concurrency, device
                 architecture, and system organization",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "279--289",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555754.1555790",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "As their prices decline, their storage capacities
                 increase, and their endurance improves, NAND Flash
                 Solid State Disks (SSD) provide an increasingly
                 attractive alternative to Hard Disk Drives (HDD) for
                 portable computing systems and PCs. This paper presents
                 a study of NAND Flash SSD architectures and their
                 management techniques, quantifying SSD performance
                 under user-driven/PC applications in a multi-tasked
                 environment; user activity represents typical PC
                 workloads and includes browsing files and folders,
                 emailing, text editing and document creation, surfing
                 the web, listening to music and playing movies, editing
                 large pictures, and running office applications.\par

                 We find the following: (a) the real limitation to NAND
                 Flash memory performance is not its low per-device
                 bandwidth but its internal core interface; (b) NAND
                 Flash memory media transfer rates do not need to scale
                 up to those of HDDs for good performance; (c) SSD
                 organizations that exploit concurrency at both the
                 system and device level (e.g. RAID-like organizations
                 and Micron-style superblocks) improve performance
                 significantly; and (d) these system- and device-level
                 concurrency mechanisms are, to a significant degree,
                 orthogonal: that is, the performance increase due to
                 one does not come at the expense of the other, as each
                 exploits a different facet of concurrency exhibited
                 within the PC workload.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "flash memory; performance; solid state disks; storage
                 systems",
}

@Article{Bhattacharjee:2009:TCP,
  author =       "Abhishek Bhattacharjee and Margaret Martonosi",
  title =        "Thread criticality predictors for dynamic performance,
                 power, and resource management in chip
                 multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "290--301",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555754.1555792",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "With the shift towards chip multiprocessors (CMPs),
                 exploiting and managing parallelism has become a
                 central problem in computing systems. Many issues of
                 parallelism management boil down to discerning which
                 running threads or processes are critical, or slowest,
                 versus which are non-critical. If one can accurately
                 predict critical threads in a parallel program, then
                 one can respond in a variety of ways. Possibilities
                 include running the critical thread at a faster clock
                 rate, performing load balancing techniques to offload
                 work onto currently non-critical threads, or giving the
                 critical thread more on-chip resources to execute
                 faster.\par

                 This paper proposes and evaluates simple but effective
                 thread criticality predictors for parallel
                 applications. We show that accurate predictors can be
                 built using counters that are typically already
                 available on-chip. Our predictor, based on memory
                 hierarchy statistics, identifies thread criticality
                 with an average accuracy of 93\% across a range of
                 architectures.\par

                 We also demonstrate two applications of our predictor.
                 First, we show how Intel's Threading Building Blocks
                 (TBB) parallel runtime system can benefit from task
                 stealing techniques that use our criticality predictor
                 to reduce load imbalance. Using criticality prediction
                 to guide TBB's task-stealing decisions improves
                 performance by 13--32\% for TBB-based PARSEC benchmarks
                 running on a 32-core CMP. As a second application,
                 criticality prediction guides dynamic energy
                 optimizations in barrier-based applications. By running
                 the predicted critical thread at the full clock rate
                 and frequency-scaling non-critical threads, this
                 approach achieves average energy savings of 15\% while
                 negligibly degrading performance for SPLASH-2 and
                 PARSEC benchmarks.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "caches; DVFS; Intel TBB; parallel processing; thread
                 criticality prediction",
}

@Article{Rangan:2009:TMF,
  author =       "Krishna K. Rangan and Gu-Yeon Wei and David Brooks",
  title =        "Thread motion: fine-grained power management for
                 multi-core systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "302--313",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555815.1555793",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Dynamic voltage and frequency scaling (DVFS) is a
                 commonly-used power-management scheme that dynamically
                 adjusts power and performance to the time-varying needs
                 of running programs. Unfortunately, conventional DVFS,
                 relying on off-chip regulators, faces limitations in
                 terms of temporal granularity and high costs when
                 considered for future multi-core systems. To overcome
                 these challenges, this paper presents thread motion
                 (TM), a fine-grained power-management scheme for chip
                 multiprocessors (CMPs). Instead of incurring the high
                 cost of changing the voltage and frequency of different
                 cores, TM enables rapid movement of threads to adapt
                 the time-varying computing needs of running
                 applications to a mixture of cores with fixed but
                 different power/performance levels. Results show that
                 for the same power budget, two voltage/frequency levels
                 are sufficient to provide performance gains
                 commensurate to idealized scenarios using per-core
                 voltage control. Thread motion extends workload-based
                 power management into the nanosecond realm and, for a
                 given power budget, provides up to 20\% better
                 performance than coarse-grained DVFS.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "DVFS; multi-core power management; thread motion",
}

@Article{Wang:2009:TCP,
  author =       "Yefu Wang and Kai Ma and Xiaorui Wang",
  title =        "Temperature-constrained power control for chip
                 multiprocessors with online model estimation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "314--324",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555815.1555794",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "As chip multiprocessors (CMP) become the main trend in
                 processor development, various power and thermal
                 management strategies have recently been proposed to
                 optimize system performance while controlling the power
                 or temperature of a CMP chip to stay below a
                 constraint. The availability of per-core DVFS (dynamic
                 voltage and frequency scaling) also makes it possible
                 to develop advanced management strategies. However,
                 most existing solutions rely on open-loop search or
                 optimization with the assumption that power can be
                 estimated accurately, while others adopt oversimplified
                 feedback control strategies to control power and
                 temperature separately, without any theoretical
                 guarantees. In this paper, we propose a chip-level
                 power control algorithm that is systematically designed
                 based on optimal control theory. Our algorithm can
                 precisely control the power of a CMP chip to the
                 desired set point while maintaining the temperature of
                 each core below a specified threshold. Furthermore, an
                 online model estimator is designed to achieve
                 analytical assurance of control accuracy and system
                 stability, even in the face of significant workload
                 variations or unpredictable chip or core variations.
                 Empirical results on a physical testbed show that our
                 controller outperforms two state-of-the-art control
                 algorithms by having better SPEC benchmark performance
                 and more precise power control. In addition, extensive
                 simulation results demonstrate the efficacy of our
                 algorithm for various CMP configurations.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "chip multiprocessor; feedback control; power
                 management",
}

@Article{Yu:2009:CIC,
  author =       "Jie Yu and Satish Narayanasamy",
  title =        "A case for an interleaving constrained shared-memory
                 multi-processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "325--336",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555815.1555796",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Shared-memory multi-threaded programming is inherently
                 more difficult than single-threaded programming. The
                 main source of complexity is that, the threads of an
                 application can interleave in so many different ways.
                 To ensure correctness, a programmer has to test all
                 possible thread interleavings, which, however, is
                 impractical.\par

                 Many rare thread interleavings remain untested in
                 production systems, and they are the root cause for a
                 majority of concurrency bugs. We propose a
                 shared-memory multi-processor design that avoids
                 untested interleavings to improve the correctness of a
                 multi-threaded program. Since untested interleavings
                 tend to occur infrequently at runtime, the performance
                 cost of avoiding them is not high.\par

                 We propose to encode the set of tested correct
                 interleavings in a program's binary executable using
                 {\em Predecessor Set (PSet)\/} constraints. These
                 constraints are efficiently enforced at runtime using
                 processor support, which ensures that the runtime
                 follows a tested interleaving. We analyze several bugs
                 in open source applications such as MySQL, Apache,
                 Mozilla, etc., and show that, by enforcing PSet
                 constraints, we can avoid not only data races and
                 atomicity violations, but also other forms of
                 concurrency bugs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "concurrency bugs; multiprocessors; parallel
                 programming; software reliability",
}

@Article{Muzahid:2009:SSB,
  author =       "Abdullah Muzahid and Dario Su{\'a}rez and Shanxiang Qi
                 and Josep Torrellas",
  title =        "{SigRace}: signature-based data race detection",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "337--348",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555815.1555797",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Detecting data races in parallel programs is important
                 for both software development and production-run
                 diagnosis. Recently, there have been several proposals
                 for hardware-assisted data race detection. Such
                 proposals typically modify the L1 cache and cache
                 coherence protocol messages, and largely lose their
                 capability when lines get displaced or invalidated from
                 the cache. To eliminate these shortcomings, this paper
                 proposes a novel, different approach to
                 hardware-assisted data race detection. The approach,
                 called SigRace, relies on hardware address signatures.
                 As a processor runs, the addresses of the data that it
                 accesses are automatically encoded in signatures. At
                 certain times, the signatures are automatically passed
                 to a hardware module that intersects them with those of
                 other processors. If the intersection is not null, a
                 data race may have occurred.\par

                 This paper presents the architecture of SigRace, an
                 implementation, and its software interface. With
                 SigRace, caches and coherence protocol messages are
                 unmodified. Moreover, cache lines can be displaced and
                 invalidated with no effect. Our experiments show that
                 SigRace is significantly more effective than a
                 state-of-the-art conventional hardware-assisted race
                 detector. SigRace finds on average 29\% more static
                 races and 107\% more dynamic races. Moreover, if we
                 inject data races, SigRace finds 150\% more static
                 races than the conventional scheme.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "concurrency defect; data race; happened-before;
                 signature; SigRace; timestamp",
}

@Article{Nagarajan:2009:EEC,
  author =       "Vijay Nagarajan and Rajiv Gupta",
  title =        "{ECMon}: exposing cache events for monitoring",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "349--360",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555754.1555798",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The advent of multicores has introduced new challenges
                 for programmers to provide increased performance and
                 software reliability. There has been significant
                 interest in techniques that use software speculation to
                 better utilize the computational power of multicores.
                 At the same time, several recent proposals for ensuring
                 software reliability are not applicable in a multicore
                 setting due to their inability to handle interprocessor
                 shared memory dependences (ISMDs). The demands for
                 performing speculation and ensuring software
                 reliability in a multicore setting, although seemingly
                 different, share a common requirement: the need for
                 monitoring program execution and collecting
                 interprocessor dependence information at low overhead.
                 For example, an important component of speculation is
                 the efficient detection of misspeculation which in turn
                 requires dependence information. Likewise, tasks that
                 help ensure software reliability on multicores,
                 including {\em recording for replay}, require ISMD
                 information.\par

                 In this paper, we propose {\em ECMon:\/} support for
                 exposing cache events to the software. This enables the
                 programmer to catch these events and react to them; in
                 effect, efficiently exposing the ISMDs to the
                 programmer. In the context of speculation, we show how
                 {\em ECMon\/} optimizes the detection of
                 miss-speculation; we use this simple support to
                 speculate past active barriers and achieve a speedup of
                 12\% for the set of parallel programs considered. As an
                 application of ensuring software reliability, we show
                 how {\em ECMon\/} can be used to record shared memory
                 dependences on multicores using no specialized hardware
                 support at only 2.8 fold execution time overhead.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "cache events; recording for replay; speculation past
                 barriers",
}

@Article{Saidi:2009:EEP,
  author =       "Ali G. Saidi and Nathan L. Binkert and Steven K.
                 Reinhardt and Trevor Mudge",
  title =        "End-to-end performance forecasting: finding
                 bottlenecks before they happen",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "361--370",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555754.1555800",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Many important workloads today, such as web-hosted
                 services, are limited not by processor core performance
                 but by interactions among the cores, the memory system,
                 I/O devices, and the complex software layers that tie
                 these components together. Architects designing future
                 systems for these workloads are challenged to identify
                 performance bottlenecks because, as in any concurrent
                 system, overheads in one component may be hidden due to
                 overlap with other operations. These overlaps span the
                 user/kernel and software/hardware boundaries, making
                 traditional performance analysis techniques
                 inadequate.\par

                 We present a methodology for identifying end-to-end
                 critical paths across software and simulated hardware
                 in complex networked systems. By modeling systems as
                 collections of state machines interacting via queues,
                 we can trace critical paths through multiplexed
                 processing engines, identify when resources create
                 bottlenecks (including abstract resources such as
                 flow-control credits), and predict the benefit of
                 eliminating bottlenecks by increasing hardware speeds
                 or expanding available resources.\par

                 We implement our technique in a full-system simulator
                 and analyze a TCP microbenchmark, a web server, the
                 Linux TCP/IP stack, and an Ethernet controller. From a
                 single run of the microbenchmark, our tool---within
                 minutes---correctly identifies a series of bottlenecks,
                 and predicts the performance of hypothetical systems in
                 which these bottlenecks are successively eliminated,
                 culminating in a total speedup of 3X. We then validate
                 these predictions through hours of additional
                 simulation, and find them to be accurate within
                 1--17\%. We also analyze the web server, find it to be
                 CPU-bound, and predict the performance of a system with
                 an additional core within 6\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "critical path analysis; performance analysis",
}

@Article{Rogers:2009:SBW,
  author =       "Brian M. Rogers and Anil Krishna and Gordon B. Bell
                 and Ken Vu and Xiaowei Jiang and Yan Solihin",
  title =        "Scaling the bandwidth wall: challenges in and avenues
                 for {CMP} scaling",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "371--382",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555754.1555801",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "As transistor density continues to grow at an
                 exponential rate in accordance to Moore's law, the goal
                 for many Chip Multi-Processor (CMP) systems is to scale
                 the number of on-chip cores proportionally.
                 Unfortunately, off-chip memory bandwidth capacity is
                 projected to grow slowly compared to the desired growth
                 in the number of cores. This creates a situation in
                 which each core will have a decreasing amount of
                 off-chip bandwidth that it can use to load its data
                 from off-chip memory. The situation in which off-chip
                 bandwidth is becoming a performance and throughput
                 bottleneck is referred to as the {\em bandwidth wall\/}
                 problem.\par

                 In this study, we seek to answer two questions: (1) to
                 what extent does the bandwidth wall problem restrict
                 future multicore scaling, and (2) to what extent are
                 various bandwidth conservation techniques able to
                 mitigate this problem. To address them, we develop a
                 simple but powerful analytical model to predict the
                 number of on-chip cores that a CMP can support given a
                 limited growth in memory traffic capacity. We find that
                 the bandwidth wall can severely limit core scaling.
                 When starting with a balanced 8-core CMP, in four
                 technology generations the number of cores can only
                 scale to 24, as opposed to 128 cores under proportional
                 scaling, without increasing the memory traffic
                 requirement. We find that various individual bandwidth
                 conservation techniques we evaluate have a wide ranging
                 impact on core scaling, and when combined together,
                 these techniques have the potential to enable
                 super-proportional core scaling for up to 4 technology
                 generations.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "analytical model; chip multi-processor; memory
                 bandwidth",
}

@Article{Whitney:2009:FTA,
  author =       "Mark G. Whitney and Nemanja Isailovic and Yatish Patel
                 and John Kubiatowicz",
  title =        "A fault tolerant, area efficient architecture for
                 {Shor}'s factoring algorithm",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "383--394",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555754.1555802",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "We optimize the area and latency of Shor's factoring
                 while simultaneously improving fault tolerance through:
                 (1) balancing the use of ancilla generators, (2)
                 aggressive optimization of error correction, and (3)
                 tuning the core adder circuits. Our custom CAD flow
                 produces detailed layouts of the physical components
                 and utilizes simulation to analyze circuits in terms of
                 area, latency, and success probability. We introduce a
                 metric, called ADCR, which is the probabilistic
                 equivalent of the classic Area-Delay product. Our error
                 correction optimization can reduce ADCR by order of
                 magnitude or more. Contrary to conventional wisdom, we
                 show that the area of an optimized quantum circuit is
                 {\em not\/} dominated exclusively by error
                 correction. Further, our adder evaluation shows that
                 quantum carry-lookahead adders (QCLA) beat ripple-carry
                 adders in ADCR, despite being larger and more complex.
                 We conclude with what we believe is one of most
                 accurate estimates of the area and latency required for
                 1024-bit Shor's factorization: 7659 mm$^2$ for the
                 smallest circuit and $ 6 \times 10^8 $ seconds for the
                 fastest circuit.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "CAD; control; ion trap; layout; quantum computing",
}

@Article{Putnam:2009:PPC,
  author =       "Andrew Putnam and Susan Eggers and Dave Bennett and
                 Eric Dellinger and Jeff Mason and Henry Styles and
                 Prasanna Sundararajan and Ralph Wittig",
  title =        "Performance and power of cache-based reconfigurable
                 computing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "395--405",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555754.1555804",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Many-cache is a memory architecture that efficiently
                 supports caching in commercially available FPGAs. It
                 facilitates FPGA programming for high-performance
                 computing (HPC) developers by providing them with
                 memory performance that is greater and power
                 consumption that is less than their current CPU
                 platforms, but without sacrificing their familiar,
                 C-based programming environment.\par

                 Many-cache creates multiple, multi-banked caches on top
                 of an FPGA's small, independent memories, each
                 targeting a particular data structure or region of
                 memory in an application and each customized for the
                 memory operations that access it. The caches are
                 automatically generated from C source by the CHiMPS
                 C-to-FPGA compiler.\par

                 This paper presents the analyses and optimizations of
                 the CHiMPS compiler that construct many-cache caches.
                 An architectural evaluation of CHiMPS-generated FPGAs
                 demonstrates a performance advantage of 7.8x (geometric
                 mean) over CPU-only execution of the same source code,
                 FPGA power usage that is on average 4.1x less, and
                 consequently performance per watt that is also greater,
                 by a geometric mean of 21.3x.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "C-to-gates; C-to-hardware; caches; co-processor
                 accelerator; FPGA; many-cache; synthesis compiler",
}

@Article{Firoozshahian:2009:MSD,
  author =       "Amin Firoozshahian and Alex Solomatnikov and Ofer
                 Shacham and Zain Asgar and Stephen Richardson and
                 Christos Kozyrakis and Mark Horowitz",
  title =        "A memory system design framework: creating smart
                 memories",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "406--417",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555754.1555805",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "As CPU cores become building blocks, we see a great
                 expansion in the types of on-chip memory systems
                 proposed for CMPs. Unfortunately, designing the cache
                 and protocol controllers to support these memory
                 systems is complex, and their concurrency and latency
                 characteristics significantly affect the performance of
                 any CMP. To address this problem, this paper presents a
                 microarchitecture framework for cache and protocol
                 controllers, which can aid in generating the RTL for
                 new memory systems. The framework consists of three
                 pipelined engines---request-tracking,
                 state-manipulation, and data movement---which are
                 programmed to implement a higher-level memory model.
                 This approach simplifies the design and verification of
                 CMP systems by decomposing the memory model into
                 sequences of state and data manipulations. Moreover,
                 implementing the framework itself produces a
                 polymorphic memory system.\par

                 To validate the approach, we implemented a scalable,
                 flexible CMP in silicon. The memory system was then
                 programmed to support three disparate memory models---
                 cache coherent shared memory, streams and transactional
                 memory. Measured overheads of this approach seem
                 promising. Our system generates controllers with
                 performance overheads of less than 20\% compared to an
                 ideal controller with zero internal latency. Even the
                 overhead of directly implementing a fully programmable
                 controller was modest. While it did double the
                 controller's area, the amortized effective area in the
                 system grew by roughly 7\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "cache coherence; memory access protocol; memory
                 systems; multi-core processors; protocol controller;
                 reconfigurable architecture; stream programming;
                 transactional memory",
}

@Article{Joao:2009:FRC,
  author =       "Jos{\'e} A. Joao and Onur Mutlu and Yale N. Patt",
  title =        "Flexible reference-counting-based hardware
                 acceleration for garbage collection",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "418--428",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555754.1555806",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Languages featuring automatic memory management
                 (garbage collection) are increasingly used to write all
                 kinds of applications because they provide clear
                 software engineering and security advantages.
                 Unfortunately, garbage collection imposes a toll on
                 performance and introduces pause times, making such
                 languages less attractive for high-performance or
                 real-time applications. Much progress has been made
                 over the last five decades to reduce the overhead of
                 garbage collection, but it remains significant.\par

                 We propose a cooperative hardware-software technique to
                 reduce the performance overhead of garbage collection.
                 The key idea is to reduce the frequency of garbage
                 collection by efficiently detecting and reusing dead
                 memory space in hardware via hardware-implemented
                 reference counting. Thus, even though software garbage
                 collections are still eventually needed, they become
                 much less frequent and have less impact on overall
                 performance. Our technique is compatible with a variety
                 of software garbage collection algorithms, does not
                 break compatibility with existing software, and reduces
                 garbage collection time by 31\% on average on the Java
                 DaCapo benchmarks running on the production build of
                 the Jikes RVM, which uses a state-of-the-art
                 generational garbage collector.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "garbage collection; reference counting",
}

@Article{Pan:2009:FIF,
  author =       "Yan Pan and Prabhat Kumar and John Kim and Gokhan
                 Memik and Yu Zhang and Alok Choudhary",
  title =        "{Firefly}: illuminating future network-on-chip with
                 nanophotonics",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "429--440",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555815.1555808",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Future many-core processors will require
                 high-performance yet energy-efficient on-chip networks
                 to provide a communication substrate for the increasing
                 number of cores. Recent advances in silicon
                 nanophotonics create new opportunities for on-chip
                 networks. To efficiently exploit the benefits of
                 nanophotonics, we propose Firefly --- a hybrid,
                 hierarchical network architecture. Firefly consists of
                 clusters of nodes that are connected using
                 conventional, electrical signaling while the
                 inter-cluster communication is done using nanophotonics
                 --- exploiting the benefits of electrical signaling for
                 short, local communication while nanophotonics is used
                 only for global communication to realize an efficient
                 on-chip network. Crossbar architecture is used for
                 inter-cluster communication. However, to avoid global
                 arbitration, the crossbar is partitioned into multiple,
                 logical crossbars and their arbitration is localized.
                 Our evaluations show that Firefly improves the
                 performance by up to 57\% compared to an all-electrical
                 concentrated mesh (CMESH) topology on adversarial
                 traffic patterns and up to 54\% compared to an
                 all-optical crossbar (OP XBAR) on traffic patterns with
                 locality. If the energy-delay-product is compared,
                 Firefly improves the efficiency of the on-chip network
                 by up to 51\% and 38\% compared to CMESH and OP XBAR,
                 respectively.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "hierarchical network; interconnection networks;
                 nanophotonics; topology",
}

@Article{Cianchetti:2009:PRT,
  author =       "Mark J. Cianchetti and Joseph C. Kerekes and David H.
                 Albonesi",
  title =        "{Phastlane}: a rapid transit optical routing network",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "441--450",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555754.1555809",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Tens and eventually hundreds of processing cores are
                 projected to be integrated onto future microprocessors,
                 making the global interconnect a key component to
                 achieving scalable chip performance within a given
                 power envelope. While CMOS-compatible nanophotonics has
                 emerged as a leading candidate for replacing global
                 wires beyond the 22nm timeframe, on-chip optical
                 interconnect architectures proposed thus far are either
                 limited in scalability or are dependent on
                 comparatively slow electrical control networks.\par

                 In this paper, we present Phastlane, a hybrid
                 electrical/optical routing network for future large
                 scale, cache coherent multicore microprocessors. The
                 heart of the Phastlane network is a low-latency optical
                 crossbar that uses simple predecoded source routing to
                 transmit cache-line-sized packets several hops in a
                 single clock cycle under contentionless conditions.
                 When contention exists, the router makes use of
                 electrical buffers and, if necessary, a high speed drop
                 signaling network. Overall, Phastlane achieves 2X
                 better network performance than a state-of-the-art
                 electrical baseline while consuming 80\% less network
                 power.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "interconnection networks; multicore; nanophotonics;
                 optical interconnects",
}

@Article{Abts:2009:APP,
  author =       "Dennis Abts and Natalie D. Enright Jerger and John Kim
                 and Dan Gibson and Mikko H. Lipasti",
  title =        "Achieving predictable performance through better
                 memory controller placement in many-core {CMPs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "451--461",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555754.1555810",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In the near term, Moore's law will continue to provide
                 an increasing number of transistors and therefore an
                 increasing number of on-chip cores. Limited pin
                 bandwidth prevents the integration of a large number of
                 memory controllers on-chip. With many cores, and few
                 memory controllers, where to locate the memory
                 controllers in the on-chip interconnection fabric
                 becomes an important and as yet unexplored question. In
                 this paper we show how the location of the memory
                 controllers can reduce contention (hot spots) in the
                 on-chip fabric and lower the variance in reference
                 latency. This in turn provides predictable performance
                 for memory-intensive applications regardless of the
                 processing core on which a thread is scheduled. We
                 explore the design space of on-chip fabrics to find
                 optimal memory controller placement relative to
                 different topologies (i.e. mesh and torus), routing
                 algorithms, and workloads.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "chip multiprocessors; interconnection networks; memory
                 controllers; routing algorithms",
}

@Article{Luo:2009:DPT,
  author =       "Yangchun Luo and Venkatesan Packirisamy and Wei-Chung
                 Hsu and Antonia Zhai and Nikhil Mungre and Ankit
                 Tarkas",
  title =        "Dynamic performance tuning for speculative threads",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "462--473",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555754.1555812",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In response to the emergence of multicore processors,
                 various novel and sophisticated execution models have
                 been introduced to fully utilize these processors. One
                 such execution model is Thread-Level Speculation (TLS),
                 which allows potentially dependent threads to execute
                 speculatively in parallel. While TLS offers significant
                 performance potential for applications that are
                 otherwise non-parallel, extracting efficient
                 speculative threads in the presence of complex control
                 flow and ambiguous data dependences is a real
                 challenge. This task is further complicated by the fact
                 that the performance of speculative threads is often
                 architecture-dependent, input-sensitive, and exhibits
                 phase behaviors. Thus we propose dynamic performance
                 tuning mechanisms that determine where and how to
                 create speculative threads at runtime.\par

                 This paper describes the design, implementation, and
                 evaluation of hardware and software support that takes
                 advantage of runtime performance profiles to extract
                 efficient speculative threads. In our proposed
                 framework, speculative threads are monitored by
                 hardware-based performance counters and their
                 performance impact is estimated. The creation of
                 speculative threads is adjusted based on the
                 estimation. This paper proposes speculative threads
                 performance estimation techniques, that are capable of
                 correctly determining whether speculation can improve
                 performance for loops that corresponds to 83.8\% of
                 total loop execution time across all benchmarks. This
                 paper also examines several dynamic performance tuning
                 policies and finds that the best tuning policy achieves
                 an overall speedup of 36.8\% on a set of benchmarks
                 from SPEC2000 suite, which outperforms static thread
                 management by 9.5\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "dynamic optimization; multicore; parallelism;
                 thread-level speculation",
}

@Article{Madriles:2009:BST,
  author =       "Carlos Madriles and Pedro L{\'o}pez and Josep M.
                 Codina and Enric Gibert and Fernando Latorre and
                 Alejandro Martinez and Ra{\'u}l Martinez and Antonio
                 Gonzalez",
  title =        "Boosting single-thread performance in multi-core
                 systems through fine-grain multi-threading",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "474--483",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555754.1555813",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Industry has shifted towards multi-core designs as we
                 have hit the memory and power walls. However, single
                 thread performance remains of paramount importance
                 since some applications have limited thread-level
                 parallelism (TLP), and even a small part with limited
                 TLP impose important constraints to the global
                 performance, as explained by Amdahl's law.\par

                 In this paper we propose a novel approach for
                 leveraging multiple cores to improve single-thread
                 performance in a multi-core design. The proposed
                 technique features a set of novel hardware mechanisms
                 that support the execution of threads generated at
                 compile time. These threads result from a fine-grain
                 speculative decomposition of the original application
                 and they are executed under a modified multi-core
                 system that includes: (1) mechanisms to support
                 multiple versions; (2) mechanisms to detect violations
                 among threads; (3) mechanisms to reconstruct the
                 original sequential order; and (4) mechanisms to
                 checkpoint the architectural state and recovery to
                 handle misspeculations.\par

                 The proposed scheme outperforms previous hardware-only
                 schemes to implement the idea of combining cores for
                 executing single-thread applications in a multi-core
                 design by more than 10\% on average on Spec2006 for all
                 configurations. Moreover, single-thread performance is
                 improved by 41\% on average when the proposed scheme is
                 used on a Tiny Core, and up to 2.6x for some selected
                 applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "automatic parallelization; core-fusion; multicore;
                 single-thread performance; speculative multithreading;
                 thread-level parallelism",
}

@Article{Chaudhry:2009:SST,
  author =       "Shailender Chaudhry and Robert Cypher and Magnus Ekman
                 and Martin Karlsson and Anders Landin and Sherman Yip
                 and H{\aa}kan Zeffer and Marc Tremblay",
  title =        "Simultaneous speculative threading: a novel pipeline
                 architecture implemented in {Sun}'s {Rock} processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "484--495",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1555815.1555814",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper presents Simultaneous Speculative Threading
                 (SST), which is a technique for creating
                 high-performance area- and power-efficient cores for
                 chip multiprocessors. SST hardware dynamically extracts
                 two threads of execution from a single sequential
                 program (one consisting of a load miss and its
                 dependents, and the other consisting of the
                 instructions that are independent of the load miss) and
                 executes them in parallel. SST uses an efficient
                 checkpointing mechanism to eliminate the need for
                 complex and power-inefficient structures such as
                 register renaming logic, reorder buffers, memory
                 disambiguation buffers, and large issue windows.
                 Simulations of certain SST implementations show 18\%
                 better per-thread performance on commercial benchmarks
                 than larger and higher-powered out-of-order cores. Sun
                 Microsystems' ROCK processor, which is the first
                 processor to use SST cores, has been implemented and is
                 scheduled to be commercially available in 2009.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "checkpoint-based architecture; chip multiprocessor;
                 CMP; hardware speculation; instruction-level
                 parallelism; memory-level parallelism; processor
                 architecture; SST",
}

@Article{Thomasian:2009:PSS,
  author =       "Alexander Thomasian",
  title =        "Publications on storage and systems research",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "4",
  pages =        "1--26",
  month =        sep,
  year =         "2009",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Mar 15 19:03:39 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Musoll:2009:MBM,
  author =       "Enric Musoll",
  title =        "Mesh-based many-core performance under process
                 variations: a core yield perspective",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "4",
  pages =        "27--34",
  month =        sep,
  year =         "2009",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Mar 15 19:03:39 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Nikolov:2009:QTM,
  author =       "Angel V. Nikolov",
  title =        "Queuing theoretic model for a multiprocessor with
                 private caches and shared memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "4",
  pages =        "35--44",
  month =        sep,
  year =         "2009",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Mar 15 19:03:39 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:2009:INb,
  author =       "Mark Thorson",
  title =        "{Internet} nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "4",
  pages =        "45--51",
  month =        sep,
  year =         "2009",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Mar 15 19:03:39 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Musoll:2009:LSO,
  author =       "Enric Musoll",
  title =        "Leakage-saving opportunities in mesh-based massive
                 multi-core architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "5",
  pages =        "1--7",
  month =        dec,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1755235.1755237",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Apr 8 18:42:25 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "When processing multi-threaded workloads requiring
                 significant inter-thread communication, opportunities
                 to reduce power consumption arise due to the large
                 latencies in obtaining data from the threads running on
                 remote cores and the lack of architectural resources
                 implemented in the simple cores to cover these
                 latencies.\par

                 In this work we propose to use the drowsy mode
                 technique to save leakage power on the cores and
                 leverage the mesh-based communication fabric to hide
                 the wake-up latency of the core blocks. We have
                 observed a potential for reducing the overall power of
                 around 70\% in a generic homogeneous 256-core
                 tile-based multi-core architecture.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Naeem:2009:SRC,
  author =       "Abdul Naeem and Xiaowen Chen and Zhonghai Lu and Axel
                 Jantsch",
  title =        "Scalability of relaxed consistency models in {NoC}
                 based multicore architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "5",
  pages =        "8--15",
  month =        dec,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1755235.1755238",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Apr 8 18:42:25 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper studies realization of relaxed memory
                 consistency models in the network-on-chip based
                 distributed shared memory (DSM) multi-core systems.
                 Within DSM systems, memory consistency is a critical
                 issue since it affects not only the performance but
                 also the correctness of programs. We investigate the
                 scalability of the relaxed consistency models (weak,
                 release consistency) implemented by using transaction
                 counters. Our experimental results compare the average
                 and maximum code, synchronization and data latencies of
                 the two consistency models for various network sizes
                 with regular mesh topologies. The observed latencies
                 rise for both the consistency models as the network
                 size grows. However, the scaling behaviors are
                 different. With the release consistency model these
                 latencies grow significantly slower than with the weak
                 consistency due to better optimization potential by
                 means of overlapping, reordering and program order
                 relaxations. The release consistency improves the
                 performance by 15.6\% and 26.5\% on average in the code
                 and consistency latencies over the weak consistency
                 model for the specific application, as the system grows
                 from single core to 64 cores. The latency of data
                 transactions grows 2.2 times faster on the average with
                 a weak consistency model than with a release
                 consistency model when the system scales from single
                 core to 64 cores.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "distributed shared memory; memory consistency;
                 scalability; synchronization",
}

@Article{Sharma:2009:RPL,
  author =       "Sandeep Sharma and K. S. Kahlon and P. K. Bansal",
  title =        "Reliability and path length analysis of irregular
                 fault tolerant multistage interconnection network",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "5",
  pages =        "16--23",
  month =        dec,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1755235.1755239",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Apr 8 18:42:25 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In this paper reliability and path length analysis of
                 irregular Multistage Interconnection Networks have been
                 presented. We have examined FT (Four Tree) [8], MFT
                 (Modified Four Tree) [2], NFT (New Four Tree) [4], IFT
                 (improved Four Tree) [5], IASN (Irregular Augmented
                 Shuffle) [14] and IIASN (Improved Irregular Augmented
                 Shuffle) [3] networks in which the number of switches
                 in each stage are different in numbers and also have
                 express links [11]. Using upper and lower bounds
                 [7][13][15] for larger networks, the reliability [9] in
                 terms of mean time to failure of all
                 these networks are evaluated and compared with each
                 other. Each source is connected to destination with one
                 or multiple paths with varying path lengths in a
                 network. The path length analysis of all these networks
                 is also analyzed in this paper. A path length [8]
                 algorithm for IIASN network is also proposed.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "four tree network; IIASN; multistage interconnection
                 network; network reliability; NFT; path length; upper
                 bound reliability",
}

@Article{Thorson:2009:INc,
  author =       "Mark Thorson",
  title =        "{Internet} nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "5",
  pages =        "24--30",
  month =        dec,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1755235.1755241",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Apr 8 18:42:25 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Brewer:2010:TDR,
  author =       "Eric A. Brewer",
  title =        "Technology for developing regions: {Moore's Law} is
                 not enough",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "1",
  pages =        "1--2",
  month =        mar,
  year =         "2010",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Mar 17 14:42:04 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ipek:2010:DRM,
  author =       "Engin Ipek and Jeremy Condit and Edmund B. Nightingale
                 and Doug Burger and Thomas Moscibroda",
  title =        "Dynamically replicated memory: building reliable
                 systems from nanoscale resistive memories",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "1",
  pages =        "3--14",
  month =        mar,
  year =         "2010",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Mar 17 14:42:04 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kirman:2010:PEA,
  author =       "Nevin Kirman and Jos{\'e} F. Mart{\'\i}nez",
  title =        "A power-efficient all-optical on-chip interconnect
                 using wavelength-based oblivious routing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "1",
  pages =        "15--28",
  month =        mar,
  year =         "2010",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Mar 17 14:42:04 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Neelakantam:2010:RSE,
  author =       "Naveen Neelakantam and David R. Ditzel and Craig
                 Zilles",
  title =        "A real system evaluation of hardware atomicity for
                 software speculation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "1",
  pages =        "29--38",
  month =        mar,
  year =         "2010",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Mar 17 14:42:04 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Harris:2010:DFM,
  author          = {Tim Harris and Sasa Tomic and Adri{\'a}n Cristal and
                     Osman Unsal},
  title           = {Dynamic filtering: multi-purpose architecture support for
                     language runtime systems},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {38},
  number          = {1},
  pages           = {39--52},
  month           = mar,
  year            = {2010},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Wed Mar 17 14:42:04 MDT 2010},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Bergan:2010:CCR,
  author          = {Tom Bergan and Owen Anderson and Joseph Devietti and Luis
                     Ceze and Dan Grossman},
  title           = {{CoreDet}: a compiler and runtime system for
                     deterministic multithreaded execution},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {38},
  number          = {1},
  pages           = {53--64},
  month           = mar,
  year            = {2010},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Wed Mar 17 14:42:04 MDT 2010},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Raman:2010:SPU,
  author          = {Arun Raman and Hanjun Kim and Thomas R. Mason and Thomas
                     B. Jablin and David I. August},
  title           = {Speculative parallelization using software multi-threaded
                     transactions},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {38},
  number          = {1},
  pages           = {65--76},
  month           = mar,
  year            = {2010},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Wed Mar 17 14:42:04 MDT 2010},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Lee:2010:REO,
  author          = {Dongyoon Lee and Benjamin Wester and Kaushik
                     Veeraraghavan and Satish Narayanasamy and Peter M. Chen
                     and Jason Flinn},
  title           = {{Respec}: efficient online multiprocessor replay via
                     speculation and external determinism},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {38},
  number          = {1},
  pages           = {77--90},
  month           = mar,
  year            = {2010},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Wed Mar 17 14:42:04 MDT 2010},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Eyerman:2010:PJS,
  author          = {Stijn Eyerman and Lieven Eeckhout},
  title           = {Probabilistic job symbiosis modeling for {SMT} processor
                     scheduling},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {38},
  number          = {1},
  pages           = {91--102},
  month           = mar,
  year            = {2010},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Wed Mar 17 14:42:04 MDT 2010},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Shen:2010:RBV,
  author          = {Kai Shen},
  title           = {Request behavior variations},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {38},
  number          = {1},
  pages           = {103--116},
  month           = mar,
  year            = {2010},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Wed Mar 17 14:42:04 MDT 2010},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Johnson:2010:DCM,
  author          = {F. Ryan Johnson and Radu Stoica and Anastasia Ailamaki and
                     Todd C. Mowry},
  title           = {Decoupling contention management from scheduling},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {38},
  number          = {1},
  pages           = {117--128},
  month           = mar,
  year            = {2010},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Wed Mar 17 14:42:04 MDT 2010},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Zhuravlev:2010:ASR,
  author          = {Sergey Zhuravlev and Sergey Blagodurov and Alexandra
                     Fedorova},
  title           = {Addressing shared resource contention in multicore
                     processors via scheduling},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {38},
  number          = {1},
  pages           = {129--142},
  month           = mar,
  year            = {2010},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Wed Mar 17 14:42:04 MDT 2010},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Yuan:2010:SED,
  author          = {Ding Yuan and Haohui Mai and Weiwei Xiong and Lin Tan and
                     Yuanyuan Zhou and Shankar Pasupathy},
  title           = {{SherLog}: error diagnosis by connecting clues from
                     run-time logs},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {38},
  number          = {1},
  pages           = {143--154},
  month           = mar,
  year            = {2010},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Wed Mar 17 14:42:04 MDT 2010},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Weeratunge:2010:AMD,
  author          = {Dasarath Weeratunge and Xiangyu Zhang and Suresh
                     Jagannathan},
  title           = {Analyzing multicore dumps to facilitate concurrency bug
                     reproduction},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {38},
  number          = {1},
  pages           = {155--166},
  month           = mar,
  year            = {2010},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Wed Mar 17 14:42:04 MDT 2010},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Burckhardt:2010:RSP,
  author          = {Sebastian Burckhardt and Pravesh Kothari and Madanlal
                     Musuvathi and Santosh Nagarakatte},
  title           = {A randomized scheduler with probabilistic guarantees of
                     finding bugs},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {38},
  number          = {1},
  pages           = {167--178},
  month           = mar,
  year            = {2010},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Wed Mar 17 14:42:04 MDT 2010},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Zhang:2010:CDS,
  author          = {Wei Zhang and Chong Sun and Shan Lu},
  title           = {{ConMem}: detecting severe concurrency bugs through an
                     effect-oriented approach},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {38},
  number          = {1},
  pages           = {179--192},
  month           = mar,
  year            = {2010},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Wed Mar 17 14:42:04 MDT 2010},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Mesa-Martinez:2010:CPT,
  author          = {Francisco Javier Mesa-Martinez and Ehsan K. Ardestani and
                     Jose Renau},
  title           = {Characterizing processor thermal behavior},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {38},
  number          = {1},
  pages           = {193--204},
  month           = mar,
  year            = {2010},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Wed Mar 17 14:42:04 MDT 2010},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Venkatesh:2010:CCR,
  author          = {Ganesh Venkatesh and Jack Sampson and Nathan Goulding and
                     Saturnino Garcia and Vladyslav Bryksin and Jose
                     Lugo-Martinez and Steven Swanson and Michael Bedford
                     Taylor},
  title           = {Conservation cores: reducing the energy of mature
                     computations},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {38},
  number          = {1},
  pages           = {205--218},
  month           = mar,
  year            = {2010},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Wed Mar 17 14:42:04 MDT 2010},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Sudan:2010:MPI,
  author          = {Kshitij Sudan and Niladrish Chatterjee and David Nellans
                     and Manu Awasthi and Rajeev Balasubramonian and Al
                     Davis},
  title           = {Micro-pages: increasing {DRAM} efficiency with
                     locality-aware data placement},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {38},
  number          = {1},
  pages           = {219--230},
  month           = mar,
  year            = {2010},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Wed Mar 17 14:42:04 MDT 2010},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Pelley:2010:PRD,
  author          = {Steven Pelley and David Meisner and Pooya Zandevakili and
                     Thomas F. Wenisch and Jack Underwood},
  title           = {Power routing: dynamic power provisioning in the data
                     center},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {38},
  number          = {1},
  pages           = {231--242},
  month           = mar,
  year            = {2010},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Wed Mar 17 14:42:04 MDT 2010},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Ahmad:2010:JOI,
  author          = {Faraz Ahmad and T. N. Vijaykumar},
  title           = {Joint optimization of idle and cooling power in data
                     centers while maintaining response time},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {38},
  number          = {1},
  pages           = {243--256},
  month           = mar,
  year            = {2010},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Wed Mar 17 14:42:04 MDT 2010},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Goodstein:2010:BAA,
  author          = {Michelle L. Goodstein and Evangelos Vlachos and Shimin
                     Chen and Phillip B. Gibbons and Michael A. Kozuch and
                     Todd C. Mowry},
  title           = {Butterfly analysis: adapting dataflow analysis to dynamic
                     parallel monitoring},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {38},
  number          = {1},
  pages           = {257--270},
  month           = mar,
  year            = {2010},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Wed Mar 17 14:42:04 MDT 2010},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Vlachos:2010:PEA,
  author          = {Evangelos Vlachos and Michelle L. Goodstein and Michael
                     A. Kozuch and Shimin Chen and Babak Falsafi and Phillip
                     B. Gibbons and Todd C. Mowry},
  title           = {{ParaLog}: enabling and accelerating online parallel
                     monitoring of multithreaded applications},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {38},
  number          = {1},
  pages           = {271--284},
  month           = mar,
  year            = {2010},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Wed Mar 17 14:42:04 MDT 2010},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Hormati:2010:MMS,
  author          = {Amir H. Hormati and Yoonseo Choi and Mark Woh and
                     Manjunath Kudlur and Rodric Rabbah and Trevor Mudge and
                     Scott Mahlke},
  title           = {{MacroSS}: macro-{SIMDization} of streaming applications},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {38},
  number          = {1},
  pages           = {285--296},
  month           = mar,
  year            = {2010},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Wed Mar 17 14:42:04 MDT 2010},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Woo:2010:CPD,
  author          = {Dong Hyuk Woo and Hsien-Hsin S. Lee},
  title           = {{COMPASS}: a programmable data prefetcher using idle
                     {GPU} shaders},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {38},
  number          = {1},
  pages           = {297--310},
  month           = mar,
  year            = {2010},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Wed Mar 17 14:42:04 MDT 2010},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Sanchez:2010:FAS,
  author          = {Daniel Sanchez and Richard M. Yoo and Christos Kozyrakis},
  title           = {Flexible architectural support for fine-grain
                     scheduling},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {38},
  number          = {1},
  pages           = {311--322},
  month           = mar,
  year            = {2010},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Wed Mar 17 14:42:04 MDT 2010},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Romanescu:2010:SDV,
  author          = {Bogdan F. Romanescu and Alvin R. Lebeck and Daniel J.
                     Sorin},
  title           = {Specifying and dynamically verifying address
                     translation-aware memory consistency},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {38},
  number          = {1},
  pages           = {323--334},
  month           = mar,
  year            = {2010},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Wed Mar 17 14:42:04 MDT 2010},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Ebrahimi:2010:FST,
  author          = {Eiman Ebrahimi and Chang Joo Lee and Onur Mutlu and Yale
                     N. Patt},
  title           = {Fairness via source throttling: a configurable and
                     high-performance fairness substrate for multi-core memory
                     systems},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {38},
  number          = {1},
  pages           = {335--346},
  month           = mar,
  year            = {2010},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Wed Mar 17 14:42:04 MDT 2010},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Gelado:2010:ADS,
  author          = {Isaac Gelado and Javier Cabezas and Nacho Navarro and John
                     E. Stone and Sanjay Patel and Wen-mei W. Hwu},
  title           = {An asymmetric distributed shared memory model for
                     heterogeneous parallel systems},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {38},
  number          = {1},
  pages           = {347--358},
  month           = mar,
  year            = {2010},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Wed Mar 17 14:42:04 MDT 2010},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Bhattacharjee:2010:ICC,
  author          = {Abhishek Bhattacharjee and Margaret Martonosi},
  title           = {Inter-core cooperative {TLB} for chip multiprocessors},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {38},
  number          = {1},
  pages           = {359--370},
  month           = mar,
  year            = {2010},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Wed Mar 17 14:42:04 MDT 2010},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Huang:2010:OES,
  author =       "Ruirui Huang and Daniel Y. Deng and G. Edward Suh",
  title =        "{Orthrus}: efficient software integrity protection on
                 multi-cores",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "1",
  pages =        "371--384",
  month =        mar,
  year =         "2010",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Mar 17 14:42:04 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Feng:2010:SPS,
  author =       "Shuguang Feng and Shantanu Gupta and Amin Ansari and
                 Scott Mahlke",
  title =        "{Shoestring}: probabilistic soft error reliability on
                 the cheap",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "1",
  pages =        "385--396",
  month =        mar,
  year =         "2010",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Mar 17 14:42:04 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Yoon:2010:VFE,
  author          = {Doe Hyun Yoon and Mattan Erez},
  title           = {Virtualized and flexible {ECC} for main memory},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {38},
  number          = {1},
  pages           = {397--408},
  month           = mar,
  year            = {2010},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Wed Mar 17 14:42:04 MDT 2010},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Thomasian:2010:SRI,
  author =       "Alexander Thomasian",
  title =        "Storage research in industry and universities",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "2",
  pages =        "1--48",
  month =        may,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1823838.1823840",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:38 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "We review activities at universities and industrial
                 research centers in the storage area, but also briefly
                 mention topics such as processor design, operating
                 systems, databases, and performance analysis. Our
                 starting point is the Berkeley RAID proposal and the
                 associated taxonomy two decades ago. Important research
                 groups are listed and key researchers are identified.
                 We pay special attention to faculty/student
                 relationships, listing PhD theses and articles related
                 to storage. We also describe innovative storage
                 products and the companies behind them. This paper
                 complements author's `Publications in Storage and
                 Systems', ACM CAN, Sept.\ 2009.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Matthes:2010:RIC,
  author          = {Wolfgang Matthes},
  title           = {Resources instead of cores?},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {38},
  number          = {2},
  pages           = {49--63},
  month           = may,
  year            = {2010},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/1823838.1823841},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Tue Jul 6 14:11:38 MDT 2010},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Mapping conventional applications to multiple cores is a
                     difficult problem. To provide a general solution, it is
                     proposed to abandon the very concept of processor cores
                     and to populate the silicon real estate with less complex
                     control and operation units, designated as resources. A
                     hardware-software API is described that can put into
                     effect a practically unlimited number of such resources
                     and that allows for completely describing and exploiting
                     the inherent parallelism of the application problems. The
                     paper introduces the principles of operation, discusses
                     problems of feasibility and outlines the basic philosophy
                     behind the approach. The proposed principles may lead
                     to:\par

                     * Instruction set architectures which can cope with a
                     transfinite number of hardware resources.\par

                     * Processor circuits containing resources of intermediate
                     granularity and appropriately optimized
                     interconnects.\par

                     * Considerable reduction of power consumption during
                     operation at full speed.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  keywords        = {computer architecture; inherent parallelism; multicore
                     processors; parallel computing; power saving},
}

@Article{Thorson:2010:INa,
  author          = {Mark Thorson},
  title           = {{Internet} nuggets},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {38},
  number          = {2},
  pages           = {64--67},
  month           = may,
  year            = {2010},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/1823838.1823843},
  ISSN            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  ISSN-L          = {0163-5964},
  bibdate         = {Tue Jul 6 14:11:38 MDT 2010},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Dally:2010:MNC,
  author =       "William J. Dally",
  title =        "Moving the needle, computer architecture research in
                 academe and industry",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "1--1",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1816038.1815963",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The goal of computer architecture research is to move
                 the needle, that is to affect the future of computing
                 in a positive way. Publications, prototypes, and
                 studies are all just different means to this common
                 end. This talk will address how to move the needle in
                 academic and industrial settings discussing what works
                 and what doesn't. Our work is constrained by
                 applications, technology, and commercial reality. The
                 architecture funnel starts with many concepts that
                 proceed through stages of evaluation and refinement. A
                 relatively few successful concepts make it out the far
                 side to deployment. Most concepts fail, and good
                 researchers cut their losses early. The funnel has many
                 years of latency and good researchers aim for results
                 that are relevant beyond this latency. Academics are
                 best at the early stages of the concept funnel -- where
                 their long-term perspective and freedom from
                 constraints are advantages. Industry excels at the
                 later stages of the pipeline where resources and
                 experience are well suited to refining ideas for
                 deployment. Too often good concepts fall into a chasm
                 between the two. Good partnerships are needed to bridge
                 this chasm. This talk will illustrate this
                 exploration of architecture research with numerous
                 examples of successes and failures. It will give
                 recommended best practices for academic and industrial
                 research. I will close with a glimpse of the future of
                 architecture.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "research",
}

@Article{Watanabe:2010:WWD,
  author =       "Yasuko Watanabe and John D. Davis and David A. Wood",
  title =        "{WiDGET: Wisconsin Decoupled Grid Execution Tiles}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "2--13",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1816038.1815965",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The recent paradigm shift to multi-core systems
                 results in high system throughput within a specified
                 power budget. However, future systems still require
                 good single thread performance---no longer the
                 predominant design priority---to mitigate sequential
                 bottlenecks and/or to guarantee service-level
                 agreements. Unfortunately, near saturation in voltage
                 scaling necessitates a long-term alternative to dynamic
                 voltage and frequency scaling.\par

                 We propose an energy-proportional computing
                 infrastructure, called WiDGET, that decouples thread
                 context management from a sea of simple execution units
                 (EUs). WiDGET's decoupled design provides flexibility
                 to alter resource allocation for a particular
                 power-performance target while turning off unallocated
                 resources. In other words, WiDGET enables dynamic
                 customization of different combinations of small and/or
                 powerful cores on a single chip, consuming power in
                 proportion to the delivered performance.\par

                 Over all SPEC CPU2006 benchmarks, WiDGET provides
                 average per-thread performance that is 26\% better than
                 a Xeon-like processor while using 8\% less power.
                 WiDGET can also scale down to a level comparable to an
                 Atom-like processor, turning off resources to reduce
                 average power by 58\%. WiDGET achieves high power
                 efficiency (BIPS$^3$/W), exceeding Xeon-like and
                 Atom-like processors by up to $2\times$ and $21\times$,
                 respectively.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "hardware; instruction steering; performance; power
                 efficiency; power proportional computing",
}

@Article{Gibson:2010:FSC,
  author =       "Dan Gibson and David A. Wood",
  title =        "{Forwardflow}: a scalable core for power-constrained
                 {CMPs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "14--25",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1816038.1815966",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Chip Multiprocessors (CMPs) are now commodity
                 hardware, but commoditization of parallel software
                 remains elusive. In the near term, the current trend of
                 increased core-per-socket count will continue, despite
                 a lack of parallel software to exercise the hardware.
                 Future CMPs must deliver thread-level parallelism when
                 software provides threads to run, but must also
                 continue to deliver performance gains for single
                 threads by exploiting instruction-level parallelism and
                 memory-level parallelism. However, power limitations
                 will prevent conventional cores from exploiting both
                 simultaneously.\par

                 This work presents the Forwardflow Architecture, which
                 can scale its execution logic up to run single threads,
                 or down to run multiple threads in a CMP. Forwardflow
                 dynamically builds an explicit internal dataflow
                 representation from a conventional instruction set
                 architecture, using forward dependence pointers to
                 guide instruction wakeup, selection, and issue.
                 Forwardflow's backend is organized into discrete units
                 that can be individually (de-)activated, allowing each
                 core's performance to be scaled by system software at
                 the architectural level.\par

                 On single threads, Forwardflow core scaling yields a
                 mean runtime reduction of 21\% for a 37\% increase in
                 power consumption. For multithreaded workloads, a
                 Forwardflow-based CMP allows system software to select
                 the performance point that best matches available
                 power.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "chip multiprocessor (cmp); power; scalable core",
}

@Article{Azizi:2010:EPT,
  author =       "Omid Azizi and Aqeel Mahesri and Benjamin C. Lee and
                 Sanjay J. Patel and Mark Horowitz",
  title =        "Energy-performance tradeoffs in processor architecture
                 and circuit design: a marginal cost analysis",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "26--36",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1816038.1815967",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Power consumption has become a major constraint in the
                 design of processors today. To optimize a processor for
                 energy-efficiency requires an examination of
                 energy-performance trade-offs in all aspects of the
                 processor design space, including both architectural
                 and circuit design choices. In this paper, we apply an
                 integrated architecture-circuit optimization framework
                 to map out energy-performance trade-offs of several
                 different high-level processor architectures. We show
                 how the joint architecture-circuit space provides a
                 trade-off range of approximately 6.5x in performance
                 for 4x energy, and we identify the optimal
                 architectures for different design objectives. We then
                 show that many of the designs in this space come at
                 very high marginal costs. Our results show that, for a
                 large range of design objectives, voltage scaling is
                 effective in efficiently trading off performance and
                 energy, and that the choice of optimal architecture and
                 circuits does not change much during voltage scaling.
                 Finally, we show that with only two designs--a
                 dual-issue in-order design and a dual-issue
                 out-of-order design, both properly optimized--a large
                 part of the energy-performance trade-off space can be
                 covered within 3\% of the optimal energy-efficiency.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "co-optimization; design space exploration; design
                 trade-offs; energy efficiency; microarchitecture;
                 optimization",
}

@Article{Hameed:2010:USI,
  author =       "Rehan Hameed and Wajahat Qadeer and Megan Wachs and
                 Omid Azizi and Alex Solomatnikov and Benjamin C. Lee
                 and Stephen Richardson and Christos Kozyrakis and Mark
                 Horowitz",
  title =        "Understanding sources of inefficiency in
                 general-purpose chips",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "37--47",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1816038.1815968",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Due to their high volume, general-purpose processors,
                 and now chip multiprocessors (CMPs), are much more cost
                 effective than ASICs, but lag significantly in terms of
                 performance and energy efficiency. This paper explores
                 the sources of these performance and energy overheads
                 in general-purpose processing systems by quantifying
                 the overheads of a 720p HD H.264 encoder running on a
                 general-purpose CMP system. It then explores methods to
                 eliminate these overheads by transforming the CPU into
                 a specialized system for H.264 encoding. We evaluate
                 the gains from customizations useful to broad classes
                 of algorithms, such as SIMD units, as well as those
                 specific to particular computation, such as customized
                 storage and functional units.\par

                 The ASIC is 500x more energy efficient than our
                 original four-processor CMP. Broadly applicable
                 optimizations improve performance by 10x and energy by
                 7x. However, the very low energy costs of actual core
                 ops (100s fJ in 90nm) mean that over 90\% of the energy
                 used in these solutions is still `overhead'. Achieving
                 ASIC-like performance and efficiency requires
                 algorithm-specific optimizations. For each
                 sub-algorithm of H.264, we create a large, specialized
                 functional unit that is capable of executing 100s of
                 operations per instruction. This improves performance
                 and energy by an additional 25x and the final
                 customized CMP matches an ASIC solution's performance
                 within 3x of its energy and within comparable area.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "ASIC; chip multiprocessor; customization; energy
                 efficiency; h.264; high performance; Tensilica",
}

@Article{Barr:2010:TCS,
  author =       "Thomas W. Barr and Alan L. Cox and Scott Rixner",
  title =        "Translation caching: skip, don't walk (the page
                 table)",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "48--59",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1815961.1815970",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper explores the design space of MMU caches
                 that accelerate virtual-to-physical address translation
                 in processor architectures, such as x86-64, that use a
                 radix tree page table. In particular, these caches
                 accelerate the page table walk that occurs after a miss
                 in the Translation Lookaside Buffer. This paper shows
                 that the most effective MMU caches are translation
                 caches, which store partial translations and allow the
                 page walk hardware to skip one or more levels of the
                 page table.\par

                 In recent years, both AMD and Intel processors have
                 implemented MMU caches. However, their implementations
                 are quite different and represent distinct points in
                 the design space. This paper introduces three new MMU
                 cache structures that round out the design space and
                 directly compares the effectiveness of all five
                 organizations. This comparison shows that two of the
                 newly introduced structures, both of which are
                 translation cache variants, are better than existing
                 structures in many situations.\par

                 Finally, this paper contributes to the age-old
                 discourse concerning the relative effectiveness of
                 different page table organizations. Generally speaking,
                 earlier studies concluded that organizations based on
                 hashing, such as the inverted page table, outperformed
                 organizations based upon radix trees for supporting
                 large virtual address spaces. However, these studies
                 did not take into account the possibility of caching
                 page table entries from the higher levels of the radix
                 tree. This paper shows that any of the five MMU cache
                 structures will reduce radix tree page table DRAM
                 accesses far below an inverted page table.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "memory management; page walk caching; TLB",
}

@Article{Jaleel:2010:HPC,
  author =       "Aamer Jaleel and Kevin B. Theobald and Simon C.
                 {Steely, Jr.} and Joel Emer",
  title =        "High performance cache replacement using re-reference
                 interval prediction {(RRIP)}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "60--71",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1816038.1815971",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Practical cache replacement policies attempt to
                 emulate optimal replacement by predicting the
                 re-reference interval of a cache block. The commonly
                 used LRU replacement policy always predicts a
                 near-immediate re-reference interval on cache hits and
                 misses. Applications that exhibit a distant
                 re-reference interval perform badly under LRU. Such
                 applications usually have a working-set larger than the
                 cache or have frequent bursts of references to
                 non-temporal data (called scans). To improve the
                 performance of such workloads, this paper proposes
                 cache replacement using Re-reference Interval
                 Prediction (RRIP). We propose Static RRIP (SRRIP) that
                 is scan-resistant and Dynamic RRIP (DRRIP) that is both
                 scan-resistant and thrash-resistant. Both RRIP policies
                 require only 2-bits per cache block and easily
                 integrate into existing LRU approximations found in
                 modern processors. Our evaluations using PC games,
                 multimedia, server and SPEC CPU2006 workloads on a
                 single-core processor with a 2MB last-level cache (LLC)
                 show that both SRRIP and DRRIP outperform LRU
                 replacement on the throughput metric by an average of
                 4\% and 10\% respectively. Our evaluations with over
                 1000 multi-programmed workloads on a 4-core CMP with an
                 8MB shared LLC show that SRRIP and DRRIP outperform LRU
                 replacement on the throughput metric by an average of
                 7\% and 9\% respectively. We also show that RRIP
                 outperforms LFU, the state-of-the-art scan-resistant
                 replacement algorithm to-date. For the cache
                 configurations under study, RRIP requires 2X less
                 hardware than LRU and 2.5X less hardware than LFU.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "replacement; scan resistance; shared cache;
                 thrashing",
}

@Article{Stuecheli:2010:VWQ,
  author =       "Jeffrey Stuecheli and Dimitris Kaseridis and David
                 Daly and Hillery C. Hunter and Lizy K. John",
  title =        "The virtual write queue: coordinating {DRAM} and
                 last-level cache policies",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "72--82",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1815961.1815972",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In computer architecture, caches have primarily been
                 viewed as a means to hide memory latency from the CPU.
                 Cache policies have focused on anticipating the CPU's
                 data needs, and are mostly oblivious to the main
                 memory. In this paper, we demonstrate that the era of
                 many-core architectures has created new main memory
                 bottlenecks, and mandates a new approach: coordination
                 of cache policy with main memory characteristics. Using
                 the cache for memory optimization purposes, we propose
                 a Virtual Write Queue which dramatically expands the
                 memory controller's visibility of processor behavior,
                 at low implementation overhead. Through memory-centric
                 modification of existing policies, such as scheduled
                 writebacks, this paper demonstrates that performance
                 limiting effects of highly-threaded architectures can
                 be overcome. We show that through awareness of the
                 physical main memory layout and by focusing on writes,
                 both read and write average latency can be shortened,
                 memory power reduced, and overall system performance
                 improved. Through full-system cycle-accurate
                 simulations of SPEC cpu2006, we demonstrate that the
                 proposed Virtual Write Queue achieves an average 10.9\%
                 system-level throughput improvement on memory-intensive
                 workloads, along with an overall reduction of 8.7\% in
                 memory power across the whole suite.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "cache-replacement; CMP many-core; DDR DDR2 DDR3; DRAM;
                 DRAM-parameters; last-level-cache; memory-scheduling
                 writeback; page-mode; write-queue; write-scheduling",
}

@Article{Wilkerson:2010:RCP,
  author =       "Chris Wilkerson and Alaa R. Alameldeen and Zeshan
                 Chishti and Wei Wu and Dinesh Somasekhar and Shih-lien
                 Lu",
  title =        "Reducing cache power with low-cost, multi-bit
                 error-correcting codes",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "83--93",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1815961.1815973",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Technology advancements have enabled the integration
                 of large on-die embedded DRAM (eDRAM) caches. eDRAM is
                 significantly denser than traditional SRAMs, but must
                 be periodically refreshed to retain data. Like SRAM,
                 eDRAM is susceptible to device variations, which play a
                 role in determining refresh time for eDRAM cells.
                 Refresh power potentially represents a large fraction
                 of overall system power, particularly during low-power
                 states when the CPU is idle. Future designs need to
                 reduce cache power without incurring the high cost of
                 flushing cache data when entering low-power
                 states.\par

                 In this paper, we show the significant impact of
                 variations on refresh time and cache power consumption
                 for large eDRAM caches. We propose Hi-ECC, a technique
                 that incorporates multi-bit error-correcting codes to
                 significantly reduce refresh rate. Multi-bit
                 error-correcting codes usually have a complex decoder
                 design and high storage cost. Hi-ECC avoids the decoder
                 complexity by using strong ECC codes to identify and
                 disable sections of the cache with multi-bit failures,
                 while providing efficient single-bit error correction
                 for the common case. Hi-ECC includes additional
                 optimizations that allow us to amortize the storage
                 cost of the code over large data words, providing the
                 benefit of multi-bit correction at same storage cost as
                 a single-bit error-correcting (SECDED) code (2\%
                 overhead). Our proposal achieves a 93\% reduction in
                 refresh power vs. a baseline eDRAM cache without error
                 correcting capability, and a 66\% reduction in refresh
                 power vs. a system using SECDED codes.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "DRAM; ECC; eDRAM; idle power; idle states; multi-bit
                 ECC; refresh power; Vccmin",
}

@Article{Xue:2010:ICF,
  author =       "Jing Xue and Alok Garg and Berkehan Ciftcio{\u{g}}lu
                 and Jianyun Hu and Shang Wang and Ioannis Savidis and
                 Manish Jain and Rebecca Berman and Peng Liu and Michael
                 Huang and Hui Wu and Eby Friedman and Gary Wicks and
                 Duncan Moore",
  title =        "An intra-chip free-space optical interconnect",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "94--105",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1815961.1815975",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Continued device scaling enables microprocessors and
                 other systems-on-chip (SoCs) to increase their
                 performance, functionality, and hence, complexity.
                 Simultaneously, relentless scaling, if uncompensated,
                 degrades the performance and signal integrity of
                 on-chip metal interconnects. These systems have
                 therefore become increasingly communications-limited.
                 The communications-centric nature of future high
                 performance computing devices demands a fundamental
                 change in intra- and inter-chip interconnect
                 technologies.\par

                 Optical interconnect is a promising long term solution.
                 However, while significant progress in optical {\em
                 signaling\/} has been made in recent years, {\em
                 networking\/} issues for on-chip optical interconnect
                 still require much investigation. Taking the underlying
                 optical signaling systems as a drop-in replacement for
                 conventional electrical signaling while maintaining
                 conventional packet-switching architectures is unlikely
                 to realize the full potential of optical interconnects.
                 In this paper, we propose and study the design of a
                 fully distributed interconnect architecture based on
                 free-space optics. The architecture leverages a suite
                 of newly-developed or emerging devices, circuits, and
                 optics technologies. The interconnect avoids packet
                 relay altogether, offers an ultra-low transmission
                 latency and scalable bandwidth, and provides fresh
                 opportunities for coherency substrate designs and
                 optimizations.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "3d; free-space optical interconnect; intra-chip",
}

@Article{Das:2010:AEP,
  author =       "Reetuparna Das and Onur Mutlu and Thomas Moscibroda
                 and Chita R. Das",
  title =        "{A{\'e}rgia}: exploiting packet latency slack in
                 on-chip networks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "106--116",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1816038.1815976",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Traditional Network-on-Chips (NoCs) employ simple
                 arbitration strategies, such as round-robin or
                 oldest-first, to decide which packets should be
                 prioritized in the network. This is counter-intuitive
                 since different packets can have very different effects
                 on system performance due to, e.g., different level of
                 memory-level parallelism (MLP) of applications. Certain
                 packets may be performance-critical because they cause
                 the processor to stall, whereas others may be delayed
                 for a number of cycles with no effect on
                 application-level performance as their latencies are
                 hidden by other outstanding packets' latencies. In this
                 paper, we define slack as a key measure that
                 characterizes the relative importance of a packet.
                 Specifically, the slack of a packet is the number of
                 cycles the packet can be delayed in the network with no
                 effect on execution time. This paper proposes new
                 router prioritization policies that exploit the
                 available slack of interfering packets in order to
                 accelerate performance-critical packets and thus
                 improve overall system performance. When two packets
                 interfere with each other in a router, the packet with
                 the lower slack value is prioritized. We describe
                 mechanisms to estimate slack, prevent starvation, and
                 combine slack-based prioritization with other recently
                 proposed application-aware prioritization
                 mechanisms.\par

                 We evaluate slack-based prioritization policies on a
                 64-core CMP with an 8x8 mesh NoC using a suite of 35
                 diverse applications. For a representative set of case
                 studies, our proposed policy increases average system
                 throughput by 21.0\% over the commonly-used round-robin
                 policy. Averaged over 56 randomly-generated
                 multiprogrammed workload mixes, the proposed policy
                 improves system throughput by 10.3\%, while also
                 reducing application-level unfairness by 30.8\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "arbitration; memory systems; multi-core; on-chip
                 networks; packet scheduling; prioritization",
}

@Article{Koka:2010:SPN,
  author =       "Pranay Koka and Michael O. McCracken and Herb
                 Schwetman and Xuezhe Zheng and Ron Ho and Ashok V.
                 Krishnamoorthy",
  title =        "Silicon-photonic network architectures for scalable,
                 power-efficient multi-chip systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "117--128",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1815961.1815977",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Scaling trends of logic, memories, and interconnect
                 networks lead towards dense many-core chips.
                 Unfortunately, process yields and reticle sizes limit
                 the scalability of large single-chip systems.
                 Multi-chip systems break free of these areal limits,
                 but in turn require enormous chip-to-chip bandwidth.
                 The `macrochip' concept presented here integrates
                 multiple many-core processor chips in a single package
                 with silicon-photonic interconnects. This design
                 enables a multi-chip system to approach the performance
                 of a single large die.\par

                 In this paper we propose three silicon-photonic network
                 designs that provide low-power, high-bandwidth
                 inter-die communication: a static wavelength-routed
                 point-to-point network, a `two-phase' arbitrated
                 network, and a limited-connectivity point-to-point
                 network. We also adapt two existing intra-chip
                 silicon-photonic interconnects: a token-ring-based
                 crossbar and a circuit-switched torus.\par

                 We simulate a 64-die, 512-core cache-coherent macrochip
                 using all of the above networks with synthetic kernels,
                 and kernels from Splash-2 and PARSEC. We evaluate the
                 networks on performance, optical power and complexity.
                 Despite a narrow data-path width compared to the
                 token-ring or torus, the point-to-point performs 3.3x
                 and 3.9x better respectively. We show that the
                 point-to-point is over 10x more power-efficient than
                 the other networks. We also show that, contrary to
                 electronic network designs, a point-to-point network
                 has the lowest design complexity for an inter-chip
                 silicon-photonic network.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "interconnection networks; nanophotonics",
}

@Article{Beamer:2010:RAD,
  author =       "Scott Beamer and Chen Sun and Yong-Jin Kwon and Ajay
                 Joshi and Christopher Batten and Vladimir
                 Stojanovi{\'c} and Krste Asanovi{\'c}",
  title =        "Re-architecting {DRAM} memory systems with
                 monolithically integrated silicon photonics",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "129--140",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1815961.1815978",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The performance of future manycore processors will
                 only scale with the number of integrated cores if there
                 is a corresponding increase in memory bandwidth.
                 Projected scaling of electrical DRAM architectures
                 appears unlikely to suffice, being constrained by
                 processor and DRAM pin-bandwidth density and by total
                 DRAM chip power, including off-chip signaling,
                 cross-chip interconnect, and bank access energy. In
                 this work, we redesign the DRAM main memory system
                 using a proposed monolithically integrated silicon
                 photonics technology and show that our photonically
                 interconnected DRAM (PIDRAM) provides a promising
                 solution to all of these issues. Photonics can provide
                 high aggregate pin-bandwidth density through dense
                 wavelength-division multiplexing. Photonic signaling
                 provides energy-efficient communication, which we
                 exploit to not only reduce chip-to-chip interconnect
                 power but to also reduce cross-chip interconnect power
                 by extending the photonic links deep into the actual
                 PIDRAM chips. To complement these large improvements in
                 interconnect bandwidth and power, we decrease the
                 number of bits activated per bank to improve the energy
                 efficiency of the PIDRAM banks themselves. Our most
                 promising design point yields approximately a 10x power
                 reduction for a single-chip PIDRAM channel with similar
                 throughput and area as a projected future
                 electrical-only DRAM. Finally, we propose optical power
                 guiding as a new technique that allows a single PIDRAM
                 chip design to be used efficiently in several
                 multi-chip configurations that provide either increased
                 aggregate capacity or bandwidth.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "dram architecture; energy-efficiency; silicon
                 photonics",
}

@Article{Schechter:2010:UEE,
  author =       "Stuart Schechter and Gabriel H. Loh and Karin Strauss
                 and Doug Burger",
  title =        "Use {ECP}, not {ECC}, for hard failures in resistive
                 memories",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "141--152",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1816038.1815980",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "As leakage and other charge storage limitations begin
                 to impair the scalability of DRAM, non-volatile
                 resistive memories are being developed as a potential
                 replacement. Unfortunately, current error correction
                 techniques are poorly suited to this emerging class of
                 memory technologies. Unlike DRAM, PCM and other
                 resistive memories have wear lifetimes, measured in
                 writes, that are sufficiently short to make cell
                 failures common during a system's lifetime. However,
                 resistive memories are much less susceptible to
                 transient faults than DRAM. The Hamming-based ECC codes
                 used in DRAM are designed to handle transient faults
                 with no effective lifetime limits, but ECC codes
                 applied to resistive memories would wear out faster
                 than the cells they are designed to repair. This paper
                 evaluates {\em Error-Correcting Pointers\/} (ECP), a
                 new approach to error correction optimized for memories
                 in which errors are the result of permanent cell
                 failures that occur, and are immediately detectable, at
                 write time. ECP corrects errors by permanently encoding
                 the locations of failed cells into a table and
                 assigning cells to replace them. ECP provides longer
                 lifetimes than previously proposed solutions with
                 equivalent overhead. What's more, as the level of
                 variance in cell lifetimes increases -- a likely
                 consequence of further scaling -- ECP's margin of
                 improvement over existing schemes increases.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "error correction; hard failures; memory; phase change
                 memory; resistive memories",
}

@Article{Qureshi:2010:MMS,
  author =       "Moinuddin K. Qureshi and Michele M. Franceschini and
                 Luis A. Lastras-Monta{\~n}o and John P. Karidis",
  title =        "Morphable memory system: a robust architecture for
                 exploiting multi-level phase change memories",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "153--162",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1815961.1815981",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Phase Change Memory (PCM) is emerging as a scalable
                 and power efficient technology to architect future main
                 memory systems. The scalability of PCM is enhanced by
                 the property that PCM devices can store multiple bits
                 per cell. While such Multi-Level Cell (MLC) devices can
                 offer high density, this benefit comes at the expense
                 of increased read latency, which can cause significant
                 performance degradation. This paper proposes {\em
                 Morphable Memory System (MMS)}, a robust architecture
                 for efficiently incorporating MLC PCM devices in main
                 memory. MMS is based on observation that memory
                 requirement varies between workloads, and systems are
                 typically over-provisioned in terms of memory capacity.
                 So, during a phase of low memory usage, some of the MLC
                 devices can be operated at fewer bits per cell to
                 obtain lower latency. When the workload requires full
                 memory capacity, these devices can be restored to high
                 density MLC operation to have full main-memory
                 capacity. We provide the runtime monitors, the
                 hardware-OS interface, and the detailed mechanism for
                 implementing MMS. Our evaluations on an 8-core 8GB MLC
                 PCM-based system show that MMS provides, on average,
                 low latency access for 95\% of all memory requests,
                 thereby improving overall system performance by 40\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "morphable memory; multi-level cell; phase change
                 memory",
}

@Article{Pritchett:2010:SHS,
  author =       "Timothy Pritchett and Mithuna Thottethodi",
  title =        "{SieveStore}: a highly-selective, ensemble-level disk
                 cache for cost-performance",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "163--174",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1815961.1815982",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Emerging solid-state storage media can significantly
                 improve storage performance and energy. However, the
                 high cost-per-byte of solid-state media has hindered
                 wide-spread adoption in servers. This paper proposes a
                 new, cost-effective architecture -- SieveStore -- which
                 enables the use of solid-state media to significantly
                 filter access to storage ensembles. Our paper makes
                 three key contributions. First, we make a case for
                 highly-selective, storage-ensemble-level disk-block
                 caching based on the highly-skewed block popularity
                 distribution and based on the dynamic nature of the
                 popular block set. Second, we identify the problem of
                 {\em allocation-writes\/} and show that selective cache
                 allocation to reduce allocation-writes -- {\em
                 sieving\/} -- is fundamental to enable efficient
                 ensemble-level disk-caching. Third, we propose two
                 practical variants of SieveStore. Based on week-long
                 block access traces from a storage ensemble of 13
                 servers, we find that the two components (sieving and
                 ensemble-level caching) each contribute to SieveStore's
                 cost-effectiveness. Compared to unsieved,
                 ensemble-level disk-caches, SieveStore achieves
                 significantly higher hit ratios (35\%--50\% more, on
                 average) while using only 1/7$^{th}$ the number of SSD
                 drives. Further, ensemble-level caching is strictly
                 better in cost-performance compared to per-server
                 caching.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "disk cache; flash memory; selective allocation; solid
                 state disks; storage; storage ensembles",
}

@Article{Udipi:2010:RDD,
  author =       "Aniruddha N. Udipi and Naveen Muralimanohar and
                 Niladrish Chatterjee and Rajeev Balasubramonian and Al
                 Davis and Norman P. Jouppi",
  title =        "Rethinking {DRAM} design and organization for
                 energy-constrained multi-cores",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "175--186",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1816038.1815983",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "DRAM vendors have traditionally optimized the
                 cost-per-bit metric, often making design decisions that
                 incur energy penalties. A prime example is the
                 overfetch feature in DRAM, where a single request
                 activates thousands of bit-lines in many DRAM chips,
                 only to return a single cache line to the CPU. The
                 focus on cost-per-bit is questionable in modern-day
                 servers where operating costs can easily exceed the
                 purchase cost. Modern technology trends are also
                 placing very different demands on the memory system:
                 (i) queuing delays are a significant component of memory
                 access time, (ii) there is a high energy premium for
                 the level of reliability expected for business-critical
                 computing, and (iii) the memory access stream emerging
                 from multi-core systems exhibits limited locality. All
                 of these trends necessitate an overhaul of DRAM
                 architecture, even if it means a slight compromise in
                 the cost-per-bit metric.\par

                 This paper examines three primary innovations. The
                 first is a modification to DRAM chip microarchitecture
                 that retains the traditional DDRx SDRAM interface.
                 Selective Bit-line Activation (SBA) waits for both RAS
                 (row address) and CAS (column address) signals to
                 arrive before activating exactly those bitlines that
                 provide the requested cache line. SBA reduces energy
                 consumption while incurring slight area and performance
                 penalties. The second innovation, Single Subarray
                 Access (SSA), fundamentally re-organizes the layout of
                 DRAM arrays and the mapping of data to these arrays so
                 that an entire cache line is fetched from a single
                 subarray. It requires a different interface to the
                 memory controller, reduces dynamic and background
                 energy (by about 6X), incurs a slight area penalty
                 (4\%), and can even lead to performance improvements
                 (54\% on average) by reducing queuing delays. The third
                 innovation further penalizes the cost-per-bit metric by
                 adding a checksum feature to each cache line. This
                 checksum error-detection feature can then be used to
                 build stronger RAID-like fault tolerance, including
                 chipkill-level reliability. Such a technique is
                 especially crucial for the SSA architecture where the
                 entire cache line is localized to a single chip. This
                 DRAM chip microarchitectural change leads to a dramatic
                 reduction in the energy and storage overheads for
                 reliability. The proposed architectures will also apply
                 to other emerging memory technologies (such as
                 resistive memories) and will be less disruptive to
                 standards, interfaces, and the design flow if they can
                 be incorporated into first-generation designs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "chipkill; dram architecture; energy-efficiency;
                 locality; subarrays",
}

@Article{Chen:2010:LPP,
  author =       "Yunji Chen and Weiwu Hu and Tianshi Chen and Ruiyang
                 Wu",
  title =        "{LReplay}: a pending period based deterministic replay
                 scheme",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "187--197",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1816038.1815985",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Debugging parallel program is a well-known difficult
                 problem. A promising method to facilitate debugging
                 parallel program is using hardware support to achieve
                 deterministic replay. A hardware-assisted deterministic
                 replay scheme should have a small log size, as well as
                 low design cost, to be feasible for adopting by
                 industrial processors. To achieve the goals, we propose
                 a novel and succinct hardware-assisted deterministic
                 replay scheme named LReplay. The key innovation of
                 LReplay is that instead of recording the logical time
                 orders between instructions or instruction blocks as
                 previous investigations, LReplay is built upon
                 recording the pending period information [6]. According
                 to the experimental results on Godson-3, the overall
                 log size of LReplay is about 0.55B/K-Inst (byte per
                 k-instruction) for sequential consistency, and
                 0.85B/K-Inst for Godson-3 consistency. The log size is
                 smaller in an order of magnitude than state-of-art
                 deterministic replay schemes incurring no performance
                 loss. Furthermore, LReplay only consumes about $ 1.3 \%
                 $ area of Godson-3, since it requires only trivial
                 modifications to the existing components of Godson-3.
                 The above features of LReplay demonstrate the potential
                 of integrating hardware-assisted deterministic replay
                 into future industrial processors.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "deterministic replay; DFD; global clock; multi-core
                 processor; pending period; physical time order",
}

@Article{Voskuilen:2010:TEA,
  author =       "Gwendolyn Voskuilen and Faraz Ahmad and T. N.
                 Vijaykumar",
  title =        "{Timetraveler}: exploiting acyclic races for
                 optimizing memory race recording",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "198--209",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1816038.1815986",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "As chip multiprocessors emerge as the prevalent
                 microprocessor architecture, support for debugging
                 shared-memory parallel programs becomes important. A
                 key difficulty is the programs' nondeterministic
                 semantics due to which replay runs of a buggy program
                 may not reproduce the bug. The non-determinism stems
                 from memory races where accesses from two threads, at
                 least one of which is a write, go to the same memory
                 location. Previous hardware schemes for memory race
                 recording log the predecessor-successor thread ordering
                 at memory races and enforce the same orderings in the
                 replay run to achieve deterministic replay. To reduce
                 the log size, the schemes exploit transitivity in the
                 orderings to avoid recording redundant orderings. To
                 reduce the log size further while requiring minimal
                 hardware, we propose {\em Timetraveler\/} which for the
                 first time exploits acyclicity of races based on the
                 key observation that an acyclic race need not be
                 recorded even if the race is not covered already by
                 transitivity. Timetraveler employs a novel and elegant
                 mechanism called {\em post-dating\/} which both ensures
                 that acyclic races, including those through the L2, are
                 eventually ordered correctly, and identifies cyclic
                 races. To address false cycles through the L2,
                 Timetraveler employs another novel mechanism called
                 {\em time-delay buffer\/} which delays the advancement
                 of the L2 banks' timestamps and thereby reduces the
                 false cycles. Using simulations, we show that
                 Timetraveler reduces the log size for commercial
                 workloads by 88\% over the best previous approach while
                 using only a 696-byte time-delay buffer.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "debugging; determinism; race recording; replay",
}

@Article{Lucia:2010:CES,
  author =       "Brandon Lucia and Luis Ceze and Karin Strauss and Shaz
                 Qadeer and Hans-J. Boehm",
  title =        "Conflict exceptions: simplifying concurrent language
                 semantics with precise hardware exceptions for
                 data-races",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "210--221",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1816038.1815987",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "We argue in this paper that concurrency errors should
                 be treated as exceptions, {\em i.e.}, have fail-stop
                 behavior and precise semantics. We propose an exception
                 model based on conflict of synchronization free
                 regions, which precisely detects a broad class of
                 data-races. We show that our exceptions provide enough
                 guarantees to simplify high-level programming language
                 semantics and debugging, but are significantly cheaper
                 to enforce than traditional data-race detection. To
                 make the performance cost of enforcement negligible, we
                 propose architecture support for accurately detecting
                 and precisely delivering these exceptions. We evaluate
                 the suitability of our model as well as the behavior of
                 our architectural mechanisms using the PARSEC benchmark
                 suite and commercial applications. Our results show
                 that the exception model largely reflects how
                 programmers are already writing code and that the main
                 memory, traffic and performance overheads of the
                 enforcement mechanisms we propose are very low.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "bug detection; data-races; memory consistency models;
                 multicores; threads",
}

@Article{Lucia:2010:CAS,
  author =       "Brandon Lucia and Luis Ceze and Karin Strauss",
  title =        "{ColorSafe}: architectural support for debugging and
                 dynamically avoiding multi-variable atomicity
                 violations",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "222--233",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1815961.1815988",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In this paper, we propose ColorSafe, an architecture
                 that detects and dynamically avoids single- and
                 multi-variable atomicity violation bugs. The key idea
                 is to group related data into colors and then monitor
                 access interleavings in the `color space'. This enables
                 detection of atomicity violations involving any data of
                 the same color. We leverage support for meta-data to
                 maintain color information, and signatures to
                 efficiently keep recent color access histories.
                 ColorSafe dynamically avoids atomicity violations by
                 inserting ephemeral transactions that prevent erroneous
                 interleavings. ColorSafe has two modes of operation:
                 (1) {\em debugging mode\/} makes detection more
                 precise, producing fewer false positives and collecting
                 more information; and, (2) {\em deployment mode\/}
                 provides robust, efficient dynamic bug avoidance with
                 less precise detection. This makes ColorSafe useful
                 throughout the lifetime of programs, not just during
                 development. Our results show that, in deployment mode,
                 ColorSafe is able to successfully avoid the majority of
                 multi-variable atomicity violations in bug kernels, as
                 well as in large applications (Apache and MySQL). In
                 debugging mode, ColorSafe detects bugs with few false
                 positives.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "atomicity violations; bug avoidance; concurrency
                 errors; data coloring; debugging; multi-variable",
}

@Article{Irwin:2010:SCM,
  author =       "Mary Jane Irwin",
  title =        "Shared caches in multicores: the good, the bad, and
                 the ugly",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "234--234",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1815961.1815990",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "As we transition from clock-frequency performance
                 scaling to performance scaling with multicores, the
                 pressure on the memory hierarchy is increasing
                 dramatically. Many different on-chip cache topologies
                 have been proposed/implemented; effective management of
                 these shared caches is crucial to multicore
                 performance.\par

                 This talk will begin with a description of a cache miss
                 classification scheme for multicores (compulsory,
                 inter-core misses, intra-core misses) that gives
                 insight into the interactions between memory
                 transactions of the different cores on a chip sharing a
                 cache. Ways to improve the on-chip cache performance
                 with architectural enhancements, compiler enhancements,
                 and runtime system enhancements will then be discussed.
                 If the application thread mapping and the on-chip
                 topology is static (i.e., does not change during
                 runtime), then compiler enhancements that support cache
                 topology aware code optimization can be used to
                 significantly improve an application's performance.
                 Results from such an augmented compiler, where the
                 topology is exposed to the compiler and where the
                 compiler also does thread-to-core mapping assignments,
                 will be presented. If the application thread mapping or
                 the on-chip topology is dynamic, then other
                 alternatives exist. For example, a thread scheduler, or
                 allocator, can make decisions about moving threads to
                 different cores during runtime in the hopes of
                 improving overall cache performance. Initial
                 experiments with the REEact system being developed by
                 researchers at Penn State--UPittsburgh--UVirginia that
                 `reacts' to hardware conditions (such as cache miss
                 rates, hot-spots, etc.) by reallocating threads at
                 runtime will be outlined. Finally, if the on-chip cache
                 topology itself is dynamic (i.e., is designed to be
                 reconfigurable at runtime), large performance benefits
                 might be obtained. However, both hardware and software
                 design challenges to realizing such a dynamic system
                 abound. Some of these challenges will be briefly
                 discussed.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "caches; multicore",
}

@Article{Meng:2010:DWS,
  author =       "Jiayuan Meng and David Tarjan and Kevin Skadron",
  title =        "Dynamic warp subdivision for integrated branch and
                 memory divergence tolerance",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "235--246",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1815961.1815992",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "SIMD organizations amortize the area and power of
                 fetch, decode, and issue logic across multiple
                 processing units in order to maximize throughput for a
                 given area and power budget. However, throughput is
                 reduced when a set of threads operating in lockstep (a
                 warp) are stalled due to long latency memory accesses.
                 The resulting idle cycles are extremely costly.
                 Multi-threading can hide latencies by interleaving the
                 execution of multiple warps, but deep multi-threading
                 using many warps dramatically increases the cost of the
                 register files (multi-threading depth $ \times $ SIMD
                 width), and cache contention can make performance
                 worse. Instead, intra-warp latency hiding should first
                 be exploited. This allows threads that are ready but
                 stalled by SIMD restrictions to use these idle cycles
                 and reduces the need for multi-threading among warps.
                 This paper introduces {\em dynamic warp subdivision\/}
                 (DWS), which allows a single warp to occupy more than
                 one slot in the scheduler without requiring extra
                 register file space. Independent scheduling entities
                 allow divergent branch paths to interleave their
                 execution, and allow threads that hit to run ahead. The
                 result is improved latency hiding and memory level
                 parallelism (MLP). We evaluate the technique on a
                 coherent cache hierarchy with private L1 caches and a
                 shared L2 cache. With an area overhead of less than
                 1\%, experiments with eight data-parallel benchmarks
                 show our technique improves performance on average by
                 1.7$ \times $.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "branch divergence; cache; latency hiding; memory
                 divergence; SIMD; warp",
}

@Article{Chakradhar:2010:DCC,
  author =       "Srimat Chakradhar and Murugan Sankaradas and Venkata
                 Jakkula and Srihari Cadambi",
  title =        "A dynamically configurable coprocessor for
                 convolutional neural networks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "247--257",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1815961.1815993",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Convolutional neural networks (CNN) applications range
                 from recognition and reasoning (such as handwriting
                 recognition, facial expression recognition and video
                 surveillance) to intelligent text applications such as
                 semantic text analysis and natural language processing
                 applications. Two key observations drive the design of
                 a new architecture for CNN. First, CNN workloads
                 exhibit a {\em widely varying mix of three types of
                 parallelism\/}: parallelism within a convolution
                 operation, intra-output parallelism where multiple
                 input sources (features) are combined to create a
                 single output, and inter-output parallelism where
                 multiple, independent outputs (features) are computed
                 simultaneously. Workloads differ significantly across
                 different CNN applications, and across different layers
                 of a CNN. Second, the number of processing elements in
                 an architecture continues to scale (as per Moore's law)
                 much faster than off-chip memory bandwidth (or
                 pin-count) of chips. Based on these two observations,
                 we show that for a given number of processing elements
                 and off-chip memory bandwidth, a new CNN hardware
                 architecture that dynamically configures the hardware
                 on-the-fly to match the specific mix of parallelism in
                 a given workload gives the best throughput performance.
                 Our CNN compiler automatically translates high
                 abstraction network specification into a parallel
                 microprogram (a sequence of low-level VLIW
                 instructions) that is mapped, scheduled and executed by
                 the coprocessor. Compared to a 2.3 GHz quad-core, dual
                 socket Intel Xeon, 1.35 GHz C870 GPU, and a 200 MHz
                 FPGA implementation, our 120 MHz dynamically
                 configurable architecture is 4x to 8x faster. This is
                 the {\em first CNN architecture to achieve real-time
                 video stream processing\/} (25 to 30 frames per second)
                 on a wide range of object detection and recognition
                 tasks.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "convolutional neural networks; dynamic
                 reconfiguration; parallel computer architecture",
}

@Article{Blundell:2010:RTR,
  author =       "Colin Blundell and Arun Raghavan and Milo M. K.
                 Martin",
  title =        "{RETCON}: transactional repair without replay",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "258--269",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1815961.1815995",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Over the past decade there has been a surge of
                 academic and industrial interest in optimistic
                 concurrency, {\em i.e.\/} the speculative parallel
                 execution of code regions that have the semantics of
                 isolation. This work analyzes scalability bottlenecks
                 of workloads that use optimistic concurrency. We find
                 that one common bottleneck is updates to auxiliary
                 program data in otherwise non-conflicting operations,
                 {\em e.g.\/} reference count updates and hashtable
                 occupancy field increments.\par

                 To eliminate the performance impact of conflicts on
                 such auxiliary data, this work proposes RETCON, a
                 hardware mechanism that tracks the relationship between
                 input and output values symbolically and uses this
                 symbolic information to transparently repair the output
                 state of a transaction at commit. RETCON is inspired by
                 instruction replay-based mechanisms but exploits
                 simplifying properties of the nature of computations on
                 auxiliary data to perform repair {\em without\/}
                 replay. Our experiments show that RETCON provides
                 significant speedups for workloads that exhibit
                 conflicts on auxiliary data, including transforming a
                 transactionalized version of the Python interpreter
                 from a workload that exhibits no scaling to one that
                 exhibits near-linear scaling on 32 cores.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "parallel programming; transactional memory",
}

@Article{Lee:2010:TTD,
  author =       "Janghaeng Lee and Haicheng Wu and Madhumitha
                 Ravichandran and Nathan Clark",
  title =        "{Thread Tailor}: dynamically weaving threads together
                 for efficient, adaptive parallel applications",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "270--279",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1815961.1815996",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Extracting performance from modern parallel
                 architectures requires that applications be divided
                 into many different threads of execution. Unfortunately
                 selecting the appropriate number of threads for an
                 application is a daunting task. Having too many threads
                 can quickly saturate shared resources, such as cache
                 capacity or memory bandwidth, thus degrading
                 performance. On the other hand, having too few threads
                 makes inefficient use of the resources available.
                 Beyond static resource assignment, the program inputs
                 and dynamic system state (e.g., what other applications
                 are executing in the system) can have a significant
                 impact on the right number of threads to use for a
                 particular application.\par

                 To address this problem we present the Thread Tailor, a
                 dynamic system that automatically adjusts the number of
                 threads in an application to optimize system
                 efficiency. The Thread Tailor leverages offline
                 analysis to estimate what type of threads will exist at
                 runtime and the communication patterns between them.
                 Using this information Thread Tailor dynamically
                 combines threads to better suit the needs of the target
                 system. Thread Tailor adjusts not only to the
                 architecture, but also other applications in the
                 system, and this paper demonstrates that this type of
                 adjustment can lead to significantly better use of
                 thread-level parallelism in real-world architectures.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "dynamic compilation; managed parallelism; threading",
}

@Article{Hong:2010:IGP,
  author =       "Sunpyo Hong and Hyesoon Kim",
  title =        "An integrated {GPU} power and performance model",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "280--289",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1816038.1815998",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "GPU architectures are increasingly important in the
                 multi-core era due to their high number of parallel
                 processors. Performance optimization for multi-core
                 processors has been a challenge for programmers.
                 Furthermore, optimizing for power consumption is even
                 more difficult. Unfortunately, as a result of the high
                 number of processors, the power consumption of
                 many-core processors such as GPUs has increased
                 significantly.\par

                 Hence, in this paper, we propose an integrated power
                 and performance (IPP) prediction model for a GPU
                 architecture to predict the optimal number of active
                 processors for a given application. The basic intuition
                 is that when an application reaches the peak memory
                 bandwidth, using more cores does not result in
                 performance improvement.\par

                 We develop an empirical power model for the GPU. Unlike
                 most previous models, which require measured execution
                 times, hardware performance counters, or architectural
                 simulations, IPP predicts execution times to calculate
                 dynamic power events. We then use the outcome of IPP to
                 control the number of running cores. We also model the
                 increases in power consumption that resulted from the
                 increases in temperature.\par

                 With the predicted optimal number of active cores, we
                 show that we can save up to 22.09\% of runtime GPU
                 energy consumption and on average 10.99\% of that for
                 the five memory bandwidth-limited benchmarks.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "analytical model; CUDA; energy; GPU architecture;
                 performance; power estimation",
}

@Article{Tan:2010:CFF,
  author =       "Zhangxi Tan and Andrew Waterman and Henry Cook and
                 Sarah Bird and Krste Asanovi{\'c} and David Patterson",
  title =        "A case for {FAME}: {FPGA} architecture model
                 execution",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "290--301",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1815961.1815999",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Given the multicore microprocessor revolution, we
                 argue that the architecture research community needs a
                 dramatic increase in simulation capacity. We believe
                 FPGA Architecture Model Execution (FAME) simulators can
                 increase the number of useful architecture research
                 experiments per day by two orders of magnitude over
                 Software Architecture Model Execution (SAME)
                 simulators. To clear up misconceptions about FPGA-based
                 simulation methodologies, we propose a FAME taxonomy to
                 distinguish the cost-performance of variations on these
                 ideas. We demonstrate our simulation speedup claim with
                 a case study wherein we employ a prototype FAME
                 simulator, RAMP Gold, to research the interaction
                 between hardware partitioning mechanisms and operating
                 system scheduling policy. The study demonstrates FAME's
                 capabilities: we run a modern parallel benchmark suite
                 on a research operating system, simulate 64-core target
                 architectures with multi-level memory hierarchy timing
                 models, and add experimental hardware mechanisms to the
                 target machine. The simulation speedup achieved by our
                 adoption of FAME---250$\times$---enables experiments with
                 more realistic time scales and data set sizes than are
                 possible with SAME.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "FPGA; microprocessors; simulation",
}

@Article{Blake:2010:ETL,
  author =       "Geoffrey Blake and Ronald G. Dreslinski and Trevor
                 Mudge and Kriszti{\'a}n Flautner",
  title =        "Evolution of thread-level parallelism in desktop
                 applications",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "302--313",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1816038.1816000",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "As the effective limits of frequency and instruction
                 level parallelism have been reached, the strategy of
                 microprocessor vendors has changed to increase the
                 number of processing cores on a single chip each
                 generation. The implicit expectation is that software
                 developers will write their applications with
                 concurrency in mind to take advantage of this sudden
                 change in direction. In this study we analyze whether
                 software developers for laptop/desktop machines have
                 followed the recent hardware trends by creating
                 software for chip multi-processing. We conduct a study
                 of a wide range of applications on Microsoft Windows 7
                 and Apple's OS X Snow Leopard, measuring {\em Thread
                 Level Parallelism\/} on a high performance workstation
                 and a low power desktop. In addition, we explore
                 graphics processing units (GPUs) and their impact on
                 chip multi-processing. We compare our findings to a
                 study done 10 years ago which concluded that a second
                 core was sufficient to improve system responsiveness.
                 Our results on today's machines show that, 10 years
                 later, surprisingly 2--3 cores are more than adequate
                 for most applications and that the GPU often remains
                 under-utilized. However, in some application specific
                 domains an 8 core SMT system with a 240 core GPU can be
                 effectively utilized. Overall these studies suggest
                 that many-core architectures are not a natural fit for
                 current desktop/laptop applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "benchmarking; desktop applications; multi-core; thread
                 level parallelism",
}

@Article{Reddi:2010:WSU,
  author =       "Vijay Janapa Reddi and Benjamin C. Lee and Trishul
                 Chilimbi and Kushagra Vaid",
  title =        "{Web} search using mobile cores: quantifying and
                 mitigating the price of efficiency",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "314--325",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1816038.1816002",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The commoditization of hardware, data center economies
                 of scale, and Internet-scale workload growth all demand
                 greater power efficiency to sustain scalability.
                 Traditional enterprise workloads, which are typically
                 memory and I/O bound, have been well served by chip
                 multiprocessors comprised of small, power-efficient
                 cores. Recent advances in mobile computing have led to
                 modern small cores capable of delivering even better
                 power efficiency. While these cores can deliver
                 performance-per-Watt efficiency for data center
                 workloads, small cores impact application
                 quality-of-service, robustness, and flexibility, as
                 these workloads increasingly invoke computationally
                 intensive kernels. These challenges constitute the
                 price of efficiency. We quantify efficiency for an
                 industry-strength online web search engine in
                 production at both the microarchitecture- and
                 system-level, evaluating search on server and
                 mobile-class architectures using Xeon and Atom
                 processors.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "bing; energy efficiency; mobile cores; web search",
}

@Article{Soundararajan:2010:IMO,
  author =       "Vijayaraghavan Soundararajan and Jennifer M.
                 Anderson",
  title =        "The impact of management operations on the virtualized
                 datacenter",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "326--337",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1815961.1816003",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Virtualization has the potential to dramatically
                 reduce the total cost of ownership of datacenters and
                 increase the flexibility of deployments for
                 general-purpose workloads. If present trends continue,
                 the datacenter of the future will be largely
                 virtualized. The base platform in such a datacenter
                 will consist of physical hosts that run hypervisors,
                 and workloads will run within virtual machines on these
                 platforms. From a system management perspective, the
                 virtualized environment enables a number of new
                 workflows in the datacenter. These workflows involve
                 operations on the physical hosts themselves, such as
                 upgrading the hypervisor, as well as operations on the
                 virtual machines, such as reconfiguration or reverting
                 from snapshots. While traditional datacenter design has
                 focused on the cost vs. capability tradeoffs for the
                 end-user applications running in the datacenter, we
                 argue that the management workload from these workflows
                 must be factored into the design of the virtualized
                 datacenter.\par

                 In this paper, we examine data from real-world
                 virtualized deployments to characterize common
                 management workflows and assess their impact on
                 resource usage in the datacenter. We show that while
                 many end-user applications are fairly light on I/O
                 requirements, the management workload has considerable
                 network and disk I/O requirements. We show that the
                 management workload scales with the increasing compute
                 power in the datacenter. Finally, we discuss the
                 implications of this management workload for the
                 datacenter.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "cloud computing; datacenter management; management
                 workload; virtual machine management",
}

@Article{Abts:2010:EPD,
  author =       "Dennis Abts and Michael R. Marty and Philip M. Wells
                 and Peter Klausler and Hong Liu",
  title =        "Energy proportional datacenter networks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "338--347",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1815961.1816004",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Numerous studies have shown that datacenter computers
                 rarely operate at full utilization, leading to a number
                 of proposals for creating servers that are {\em energy
                 proportional\/} with respect to the computation that
                 they are performing.\par

                 In this paper, we show that as servers themselves
                 become more energy proportional, the datacenter network
                 can become a significant fraction (up to 50\%) of
                 cluster power. In this paper we propose several ways to
                 design a high-performance datacenter network whose
                 power consumption is more proportional to the amount of
                 traffic it is moving --- that is, we propose {\em energy
                 proportional datacenter networks}.\par

                 We first show that a flattened butterfly topology
                 itself is inherently more power efficient than the
                 other commonly proposed topology for high-performance
                 datacenter networks. We then exploit the
                 characteristics of modern plesiochronous links to
                 adjust their power and performance envelopes
                 dynamically. Using a network simulator, driven by both
                 synthetic workloads and production datacenter traces,
                 we characterize and understand design tradeoffs, and
                 demonstrate an 85\% reduction in power --- which
                 approaches the ideal energy-proportionality of the
                 network.\par

                 Our results also demonstrate two challenges for the
                 designers of future network switches: (1) We show that
                 there is a significant power advantage to having
                 independent control of each unidirectional channel
                 comprising a network link, since many traffic patterns
                 show very asymmetric use, and (2) system designers
                 should work to optimize the high-speed channel designs
                 to be more energy efficient by choosing optimal data
                 rate and equalization technology. Given these
                 assumptions, we demonstrate that energy proportional
                 datacenter communication is indeed possible.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "datacenter networks; interconnection networks;
                 low-power networking",
}

@Article{Thacker:2010:IFE,
  author =       "Charles P. Thacker",
  title =        "Improving the future by examining the past",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "348--348",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1816038.1816006",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "During the last fifty years, the technology underlying
                 computer systems has improved dramatically. As
                 technology has evolved, designers have made a series of
                 choices in the way it was applied in computers. In some
                 cases, decisions that were made in the twentieth
                 century make less sense in the twenty-first.
                 Conversely, paths not taken might now be more
                 attractive given the state of technology today,
                 particularly in light of the limits the field is
                 facing, such as the increasing gap between processor
                 speed and storage access times and the difficulty of
                 cooling today's computers.\par

                 In this talk, I'll discuss some of these choices and
                 suggest some possible changes that might make computing
                 better in the twenty-first century.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "Turing Award",
}

@Article{Temam:2010:RNN,
  author =       "Olivier Temam",
  title =        "The rebirth of neural networks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "349--349",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1816038.1816008",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "After the hype of the 1990s, where companies like
                 Intel or Philips built commercial hardware systems
                 based on neural networks, the approach quickly lost
                 ground for multiple reasons: hardware neural networks
                 were no match for software neural networks run on
                 rapidly progressing general-purpose processors, their
                 application scope was considered too limited, and even
                 progress in machine-learning theory overshadowed neural
                 networks.\par

                 However, in the past few years, a remarkable
                 convergence of trends and innovations is casting a new
                 light on neural networks and could make them valuable
                 components of future computing systems. Trends in
                 technology call for architectures which can sustain a
                 large number of defects, something neural networks are
                 intrinsically capable of. Trends in applications,
                 summarized in the recent RMS categorization, highlight
                 a number of key algorithms which are eligible to neural
                 networks implementations. At the same time, innovations
                 in technology, such as the recent realization of a
                 memristor, are creating the conditions for the
                 efficient hardware implementation of neural networks.
                 Innovations in machine learning, with the recent advent
                 of Deep Networks, have revived interest in neural
                 networks. Finally, recent findings in neurobiology
                 carry even greater prospects, where detailed
                 explanations of how complex functions, such as vision,
                 can be implemented further open up the defect-tolerance
                 and application potential of neural network
                 architectures.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "neural networks",
}

@Article{Keller:2010:NVC,
  author =       "Eric Keller and Jakub Szefer and Jennifer Rexford and
                 Ruby B. Lee",
  title =        "{NoHype}: virtualized cloud infrastructure without the
                 virtualization",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "350--361",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1815961.1816010",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Cloud computing is a disruptive trend that is changing
                 the way we use computers. The key underlying technology
                 in cloud infrastructures is virtualization --- so much
                 so that many consider virtualization to be one of the
                 key features rather than simply an implementation
                 detail. Unfortunately, the use of virtualization is the
                 source of a significant security concern. Because
                 multiple virtual machines run on the same server and
                 since the virtualization layer plays a considerable
                 role in the operation of a virtual machine, a malicious
                 party has the opportunity to attack the virtualization
                 layer. A successful attack would give the malicious
                 party control over the all-powerful virtualization
                 layer, potentially compromising the confidentiality and
                 integrity of the software and data of any virtual
                 machine. In this paper we propose removing the
                 virtualization layer, while retaining the key features
                 enabled by virtualization. Our NoHype architecture,
                 named to indicate the removal of the hypervisor,
                 addresses each of the key roles of the virtualization
                 layer: arbitrating access to CPU, memory, and I/O
                 devices, acting as a network device (e.g., Ethernet
                 switch), and managing the starting and stopping of
                 guest virtual machines. Additionally, we show that our
                 NoHype architecture may indeed be `no hype' since
                 nearly all of the needed features to realize the NoHype
                 architecture are currently available as hardware
                 extensions to processors and I/O devices.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "cloud computing; hypervisor; many-core; multi-core;
                 security; system architecture; virtualization",
}

@Article{Eyerman:2010:MCS,
  author =       "Stijn Eyerman and Lieven Eeckhout",
  title =        "Modeling critical sections in {Amdahl's Law} and its
                 implications for multicore design",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "362--370",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1815961.1816011",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper presents a fundamental law for parallel
                 performance: it shows that parallel performance is not
                 only limited by sequential code (as suggested by
                 Amdahl's law) but is also fundamentally limited by
                 synchronization through critical sections. Extending
                 Amdahl's software model to include critical sections,
                 we derive the surprising result that the impact of
                 critical sections on parallel performance can be
                 modeled as a completely sequential part and a
                 completely parallel part. The sequential part is
                 determined by the probability for entering a critical
                 section and the contention probability (i.e., multiple
                 threads wanting to enter the same critical section).
                 This fundamental result reveals at least three
                 important insights for multicore design. (i) Asymmetric
                 multicore processors deliver less performance benefits
                 relative to symmetric processors than suggested by
                 Amdahl's law, and in some cases even worse performance.
                 (ii) Amdahl's law suggests many tiny cores for optimum
                 performance in asymmetric processors, however, we find
                 that fewer but larger small cores can yield
                 substantially better performance. (iii) Executing
                 critical sections on the big core can yield substantial
                 speedups, however, performance is sensitive to the
                 accuracy of the critical section contention
                 predictor.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "Amdahl's law; analytical performance modeling;
                 critical sections; synchronization",
}

@Article{Guo:2010:RCA,
  author =       "Xiaochen Guo and Engin Ipek and Tolga Soyata",
  title =        "Resistive computation: avoiding the power wall with
                 low-leakage, {STT-MRAM} based computing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "371--382",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1815961.1816012",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "As CMOS scales beyond the 45nm technology node,
                 leakage concerns are starting to limit microprocessor
                 performance growth. To keep dynamic power constant
                 across process generations, traditional MOSFET scaling
                 theory prescribes reducing supply and threshold
                 voltages in proportion to device dimensions, a practice
                 that induces an exponential increase in subthreshold
                 leakage. As a result, leakage power has become
                 comparable to dynamic power in current-generation
                 processes, and will soon exceed it in magnitude if
                 voltages are scaled down any further. Beyond this
                 inflection point, multicore processors will not be able
                 to afford keeping more than a small fraction of all
                 cores active at any given moment. Multicore scaling
                 will soon hit a power wall.\par

                 This paper presents resistive computation, a new
                 technique that aims at avoiding the power wall by
                 migrating most of the functionality of a modern
                 microprocessor from CMOS to spin-torque transfer
                 magnetoresistive RAM (STT-MRAM)---a CMOS-compatible,
                 leakage-resistant, non-volatile resistive memory
                 technology. By implementing much of the on-chip storage
                 and combinational logic using leakage-resistant,
                 scalable RAM blocks and lookup tables, and by carefully
                 re-architecting the pipeline, an STT-MRAM based
                 implementation of an eight-core Sun Niagara-like CMT
                 processor reduces chip-wide power dissipation by
                 $ 1.7 \times $ and leakage power by $ 2.1 \times $ at
                 the 32nm technology node, while maintaining 93\% of the
                 system throughput of a CMOS-based design.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "power-efficiency; STT-MRAM",
}

@Article{Seong:2010:SRP,
  author =       "Nak Hee Seong and Dong Hyuk Woo and Hsien-Hsin S.
                 Lee",
  title =        "Security refresh: prevent malicious wear-out and
                 increase durability for phase-change memory with
                 dynamically randomized address mapping",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "383--394",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1815961.1816014",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Phase change memory (PCM) is an emerging memory
                 technology for future computing systems. Compared to
                 other non-volatile memory alternatives, PCM is more
                 matured to production, and has a faster read latency
                 and potentially higher storage density. The main
                 roadblock precluding PCM from being used, in
                 particular, in the main memory hierarchy, is its
                 limited write endurance. To address this issue, recent
                 studies proposed to either reduce PCM's write frequency
                 or use wear-leveling to evenly distribute writes.
                 Although these techniques can extend the lifetime of
                 PCM, most of them will not prevent deliberately
                 designed malicious codes from wearing it out quickly.
                 Furthermore, all the prior techniques did not consider
                 the circumstances of a compromised OS and its security
                 implication to the overall PCM design. A compromised OS
                 will allow adversaries to manipulate processes and
                 exploit side channels to accelerate wear-out.\par

                 In this paper, we argue that a PCM design not only has
                 to consider normal wear-out under normal application
                 behavior, most importantly, it must take the worst-case
                 scenario into account with the presence of malicious
                 exploits and a compromised OS to address the durability
                 and security issues simultaneously. In this paper, we
                 propose a novel, low-cost hardware mechanism called
                 Security Refresh to avoid information leak by
                 constantly migrating their physical locations inside
                 the PCM, obfuscating the actual data placement from
                 users and system software. It uses a dynamic randomized
                 address mapping scheme that swaps data using random
                 keys upon each refresh due. The hardware overhead is
                 tiny without using any table. The best lifetime we can
                 achieve under the worst-case malicious attack is more
                 than six years. Also, our scheme incurs around 1\%
                 performance degradation for normal program
                 operations.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "dynamic address remapping; phase change memory;
                 security; wear leveling",
}

@Article{Huang:2010:ICM,
  author =       "Ruirui Huang and G. Edward Suh",
  title =        "{IVEC}: off-chip memory integrity protection for both
                 security and reliability",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "395--406",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1816038.1816015",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper proposes a unified off-chip memory
                 integrity protection scheme, named IVEC. Today, a
                 system needs two independent mechanisms in order to
                 protect the memory integrity from both physical attacks
                 and random errors. Integrity verification schemes
                 detect malicious tampering of memory while error
                 correcting codes (ECC) detect and correct random
                 errors. IVEC enables both detection of malicious
                 attacks for security and correction of random errors
                 for reliability at the same time by extending the
                 integrity verification techniques. Analytical and
                 experimental studies show that IVEC can correct
                 single-bit errors and even multi-bit errors from one
                 DRAM chip within a cache block read without any
                 additional ECC bits, when the integrity verification is
                 also required for security, effectively removing the
                 memory and bandwidth overheads (12.5\%) of typical ECC
                 schemes. Alternatively, with parity bits, IVEC can
                 provide even stronger error correction capabilities
                 comparable to the traditional chip-kill correct, still
                 with less overheads. For both cases, IVEC can use
                 standard non-ECC DIMMs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "error correction; error detection; fault tolerance;
                 memory systems; reliability; security",
}

@Article{Shriraman:2010:SLW,
  author =       "Arrvindh Shriraman and Sandhya Dwarkadas",
  title =        "{Sentry}: light-weight auxiliary memory access
                 control",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "407--418",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1816038.1816016",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Light-weight, flexible access control, which allows
                 software to regulate reads and writes to any
                 granularity of memory region, can help improve the
                 reliability of today's multi-module multi-programmer
                 applications, as well as the efficiency of software
                 debugging tools. Unfortunately, access control in
                 today's processors is tied to support for virtual
                 memory, making its use both heavy weight and coarse
                 grain. In this paper, we propose Sentry, an auxiliary
                 level of virtual memory tagging that is entirely
                 subordinate to existing virtual memory-based protection
                 mechanisms and can be manipulated at the user level. We
                 implement these tags in a complexity-effective manner
                 using an M-cache (metadata cache) structure that only
                 intervenes on L1 misses, thereby minimizing changes to
                 the processor core. Existing cache coherence states are
                 repurposed to implicitly validate permissions for L1
                 hits. Sentry achieves its goal of flexible and
                 light-weight access control without disrupting existing
                 inter-application protection, sidestepping the
                 challenges associated with adding a new protection
                 framework to an existing operating system.\par

                 We illustrate the benefits of our design point using
                 (1) an Apache-based web server that uses the M-cache to
                 enforce protection boundaries among its modules and (2)
                 a watchpoint-based tool to demonstrate low-overhead
                 debugging. Protection is achieved with very few changes
                 to the source code, no changes to the programming
                 model, minimal modifications to the operating system,
                 and with low overhead incurred only when accessing
                 memory regions for which the additional level of access
                 control is enabled.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "access control; cache coherence; memory protection;
                 multiprocessors; protection domains; safety; sentry",
}

@Article{Herrero:2010:ECC,
  author =       "Enric Herrero and Jos{\'e} Gonz{\'a}lez and Ramon
                 Canal",
  title =        "Elastic cooperative caching: an autonomous dynamically
                 adaptive memory hierarchy for chip multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "419--428",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1816038.1816018",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Next generation tiled microarchitectures are going to
                 be limited by off-chip misses and by on-chip network
                 usage. Furthermore, these platforms will run an
                 heterogeneous mix of applications with very different
                 memory needs, leading to significant optimization
                 opportunities. Existing adaptive memory hierarchies use
                 either centralized structures that limit the
                 scalability or software based resource allocation that
                 increases programming complexity.\par

                 We propose Elastic Cooperative Caching, a dynamic and
                 scalable memory hierarchy that adapts automatically and
                 autonomously to application behavior for each node. Our
                 configuration uses elastic shared/private caches with
                 fully autonomous and distributed repartitioning units
                 for better scalability. Furthermore, we have extended
                 our elastic configuration with an Adaptive Spilling
                 mechanism to use the shared cache space only when it
                 can produce a performance improvement. Elastic caches
                 allow both the creation of big local private caches for
                 threads with high reuse of private data and the
                 creation of big shared spaces from unused caches. Local
                 data allocation in private regions allows to reduce
                 network usage and efficient cache partitioning allows
                 to reduce off-chip misses.\par

                 The proposed scheme outperforms previous proposals by a
                 minimum of 12\% (on average across the benchmarks) and
                 reduces the number of offchip misses by 16\%. Plus, the
                 dynamic and autonomous management of cache resources
                 avoids the reallocation of cache blocks without reuse
                 which results in an increase in energy efficiency of
                 24\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "chip multiprocessors; elastic cooperative caching;
                 memory hierarchy; tiled microarchitectures",
}

@Article{Kelm:2010:CHM,
  author =       "John H. Kelm and Daniel R. Johnson and William Tuohy
                 and Steven S. Lumetta and Sanjay J. Patel",
  title =        "{Cohesion}: a hybrid memory model for accelerators",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "429--440",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1815961.1816019",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Two broad classes of memory models are available
                 today: models with hardware cache coherence, used in
                 conventional chip multiprocessors, and models that rely
                 upon software to manage coherence, found in compute
                 accelerators. In some systems, both types of models are
                 supported using disjoint address spaces and/or physical
                 memories. In this paper we present Cohesion, a hybrid
                 memory model that enables fine-grained temporal
                 reassignment of data between hardware-managed and
                 software-managed coherence domains, allowing a system
                 to support both. Cohesion can be used to dynamically
                 adapt to the sharing needs of both applications and
                 runtimes. Cohesion requires neither copy operations nor
                 multiple address spaces.\par

                 Cohesion offers the benefits of reduced message traffic
                 and on-die directory overhead when software-managed
                 coherence can be used and the advantages of hardware
                 coherence for cases in which software-managed coherence
                 is impractical. We demonstrate our protocol using a
                 hierarchical, cached 1024-core processor with a single
                 address space that supports both software-enforced
                 coherence and a directory-based hardware coherence
                 protocol. Relative to an optimistic, hardware-coherent
                 baseline, a realizable Cohesion design achieves
                 competitive performance with a $ 2 \times $ reduction
                 in message traffic, $ 2.1 \times $ reduction in
                 directory utilization, and greater robustness to on-die
                 directory capacity.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "accelerator; cache coherence; computer architecture",
}

@Article{Suleman:2010:DMM,
  author =       "M. Aater Suleman and Onur Mutlu and Jos{\'e} A. Joao
                 and Khubaib and Yale N. Patt",
  title =        "Data marshaling for multi-core architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "441--450",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1816038.1816020",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Previous research has shown that Staged Execution
                 (SE), i.e., dividing a program into segments and
                 executing each segment at the core that has the data
                 and/or functionality to best run that segment, can
                 improve performance and save power. However, SE's
                 benefit is limited because most segments access {\em
                 inter-segment data}, i.e., data generated by the
                 previous segment. When consecutive segments run on
                 different cores, accesses to inter-segment data incur
                 cache misses, thereby reducing performance. This paper
                 proposes {\em Data Marshaling (DM)}, a new technique to
                 eliminate cache misses to inter-segment data. DM uses
                 profiling to identify instructions that generate
                 inter-segment data, and adds only 96 bytes/core of
                 storage overhead. We show that DM significantly
                 improves the performance of two promising Staged
                 Execution models, Accelerated Critical Sections and
                 producer-consumer pipeline parallelism, on both
                 homogeneous and heterogeneous multi-core systems. In
                 both models, DM can achieve almost all of the potential
                 of ideally eliminating cache misses to inter-segment
                 data. DM's performance benefit increases with the
                 number of cores.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "cmp; critical sections; pipelining; staged execution",
}

@Article{Lee:2010:DGV,
  author =       "Victor W. Lee and Changkyu Kim and Jatin Chhugani and
                 Michael Deisher and Daehyun Kim and Anthony D. Nguyen
                 and Nadathur Satish and Mikhail Smelyanskiy and
                 Srinivas Chennupaty and Per Hammarlund and Ronak
                 Singhal and Pradeep Dubey",
  title =        "Debunking the {100X} {GPU} vs. {CPU} myth: an
                 evaluation of throughput computing on {CPU} and {GPU}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "451--460",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1816038.1816021",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Recent advances in computing have led to an explosion
                 in the amount of data being generated. Processing the
                 ever-growing data in a timely manner has made
                 throughput computing an important aspect for emerging
                 applications. Our analysis of a set of important
                 throughput computing kernels shows that there is an
                 ample amount of parallelism in these kernels which
                 makes them suitable for today's multi-core CPUs and
                 GPUs. In the past few years there have been many
                 studies claiming GPUs deliver substantial speedups
                 (between 10X and 1000X) over multi-core CPUs on these
                 kernels. To understand where such large performance
                 difference comes from, we perform a rigorous
                 performance analysis and find that after applying
                 optimizations appropriate for both CPUs and GPUs the
                 performance gap between an Nvidia GTX280 processor and
                 the Intel Core i7-960 processor narrows to only 2.5x on
                 average. In this paper, we discuss optimization
                 techniques for both CPU and GPU, analyze what
                 architecture features contributed to performance
                 differences between the two architectures, and
                 recommend a set of architectural features which provide
                 significant improvement in architectural efficiency for
                 throughput kernels.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "CPU architecture; GPU architecture; performance
                 analysis; performance measurement; software
                 optimization; throughput computing",
}

@Article{Sridharan:2010:UHV,
  author =       "Vilas Sridharan and David R. Kaeli",
  title =        "Using hardware vulnerability factors to enhance {AVF}
                 analysis",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "461--472",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1815961.1816023",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Fault tolerance is now a primary design constraint for
                 all major microprocessors. One step in determining a
                 processor's compliance to its failure rate target is
                 measuring the Architectural Vulnerability Factor (AVF)
                 of each on-chip structure. The AVF of a hardware
                 structure is the probability that a fault in the
                 structure will affect the output of a program. While
                 AVF generates meaningful insight into system behavior,
                 it cannot quantify the vulnerability of an individual
                 system component (hardware, user program, etc.),
                 limiting the amount of insight that can be generated.
                 To address this, prior work has introduced the Program
                 Vulnerability Factor (PVF) to quantify the
                 vulnerability of software. In this paper, we introduce
                 and analyze the Hardware Vulnerability Factor (HVF) to
                 quantify the vulnerability of hardware.\par

                 HVF has three concrete benefits which we examine in
                 this paper. First, HVF analysis can provide insight to
                 hardware designers beyond that gained from AVF analysis
                 alone. Second, separating AVF analysis into HVF and PVF
                 steps can accelerate the AVF measurement process.
                 Finally, HVF measurement enables runtime AVF estimation
                 that combines compile-time PVF estimates with runtime
                 HVF measurements. A key benefit of this technique is
                 that it allows software developers to influence the
                 runtime AVF estimates. We demonstrate that this
                 technique can estimate AVF at runtime with an average
                 absolute error of less than 3\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "architectural vulnerability factor; fault tolerance;
                 reliability",
}

@Article{Ansari:2010:NES,
  author =       "Amin Ansari and Shuguang Feng and Shantanu Gupta and
                 Scott Mahlke",
  title =        "{Necromancer}: enhancing system throughput by
                 animating dead cores",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "473--484",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1815961.1816024",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Aggressive technology scaling into the nanometer
                 regime has led to a host of reliability challenges in
                 the last several years. Unlike on-chip caches, which
                 can be efficiently protected using conventional
                 schemes, the general core area is less homogeneous and
                 structured, making tolerating defects a much more
                 challenging problem. Due to the lack of effective
                 solutions, disabling non-functional cores is a common
                 practice in industry to enhance manufacturing yield,
                 which results in a significant reduction in system
                 throughput. Although a faulty core cannot be trusted to
                 correctly execute programs, we observe in this work
                 that for most defects, when starting from a valid
                 architectural state, execution traces on a defective
                 core actually coarsely resemble those of fault-free
                 executions. In light of this insight, we propose a
                 robust and heterogeneous core coupling execution
                 scheme, Necromancer, that exploits a functionally dead
                 core to improve system throughput by supplying hints
                 regarding high-level program behavior. We partition the
                 cores in a conventional CMP system into multiple groups
                 in which each group shares a lightweight core that can
                 be substantially accelerated using these execution
                 hints from a potentially dead core. To prevent this
                 {\em undead\/} core from wandering too far from the
                 correct path of execution, we dynamically resynchronize
                 architectural state with the lightweight core. For a
                 4-core CMP system, on average, our approach enables the
                 coupled core to achieve 78.5\% of the performance of a
                 fully functioning core. This defect tolerance and
                 throughput enhancement comes at modest area and power
                 overheads of 5.3\% and 8.5\%, respectively.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "execution abstraction; heterogeneous core coupling;
                 manufacturing defects",
}

@Article{Yan:2010:LCL,
  author =       "Guihai Yan and Xiaoyao Liang and Yinhe Han and Xiaowei
                 Li",
  title =        "Leveraging the core-level complementary effects of
                 {PVT} variations to reduce timing emergencies in
                 multi-core processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "485--496",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1815961.1816025",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Process, Voltage, and Temperature (PVT) variations can
                 significantly degrade the performance benefits expected
                 from next nanoscale technology. The primary circuit
                 implication of the PVT variations is the resultant
                 timing emergencies. In a multi-core processor running
                 multiple programs, variations create spatial and
                 temporal unbalance across the processing cores. Most
                 prior schemes are dedicated to tolerating PVT
                 variations individually for a single core, but ignore
                 the opportunity of leveraging the complementary effects
                 between variations and the intrinsic variation
                 unbalance among individual cores. We find that the
                 notorious delay impacts from different variations are
                 not necessary aggregated. Cores with mild variations
                 can share the violent workload from cores suffering
                 large variations. If operated correctly, variations on
                 different cores can help mitigating each other and
                 result in a variation-mild environment. In this paper,
                 we propose Timing Emergency Aware Thread Migration
                 (TEA-TM), a delay sensor-based scheme to reduce system
                 timing emergencies under PVT variations. Fourier
                 transform and frequency domain analysis are conducted
                 to provide the insights and the potential of the PVT
                 co-optimization scheme. Experimental results show on
                 average TEA-TM can help save up to 24\% throughput
                 loss, at the same time improve the system fairness by
                 85\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "complimentary effects; delay sensor; PVT variations;
                 thread migration; timing emergency",
}

@Article{deKruijf:2010:RAF,
  author =       "Marc de Kruijf and Shuou Nomura and Karthikeyan
                 Sankaralingam",
  title =        "{Relax}: an architectural framework for software
                 recovery of hardware faults",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "3",
  pages =        "497--508",
  month =        jun,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1815961.1816026",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 6 14:11:46 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "As technology scales ever further, device
                 unreliability is creating excessive complexity for
                 hardware to maintain the illusion of perfect operation.
                 In this paper, we consider whether exposing hardware
                 fault information to software and allowing software to
                 control fault recovery simplifies hardware design and
                 helps technology scaling.\par

                 The combination of emerging applications and emerging
                 many-core architectures makes software recovery a
                 viable alternative to hardware-based fault recovery.
                 Emerging applications tend to have {\em few I/O and
                 memory side-effects}, which limits the amount of
                 information that needs checkpointing, and they allow
                 {\em discarding individual sub-computations\/} with
                 small qualitative impact. Software recovery can harness
                 these properties in ways that hardware recovery
                 cannot.\par

                 We describe Relax, an architectural framework for
                 software recovery of hardware faults. Relax includes
                 three core components: (1) an ISA extension that allows
                 software to mark regions of code for software recovery,
                 (2) a hardware organization that simplifies reliability
                 considerations and provides energy efficiency with
                 hardware recovery support removed, and (3) software
                 support for compilers and programmers to utilize the
                 Relax ISA. Applying Relax to counter the effects of
                 process variation, our results show a 20\% energy
                 efficiency improvement for PARSEC applications with
                 only minimal source code changes and simpler
                 hardware.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "reliability; software recovery",
}

@Article{Nuno-Maganda:2010:TCH,
  author =       "Marco Nu{\~n}o-Maganda and Cesar Torres-Huitzil",
  title =        "A temporal coding hardware implementation for spiking
                 neural networks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "4",
  pages =        "2--7",
  month =        sep,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1926367.1926369",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 20 14:27:03 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Spiking Neural Networks (SNNs) models have been
                 explored in recent years due to its biological
                 plausibility where temporal coding plays an important
                 role. Biological arguments and computational
                 experiments suggest that some perceptual tasks (vision
                 and olfaction for instance) are well performed by these
                 models. Moreover, some other applications such as
                 machine learning might be benefited from this approach.
                 However, efficient simulation and implementation of
                 SNNs still remain an open challenge. There are several
                 issues that must be addressed, being one of them the
                 temporal coding of real-value data itself. In order to
                 study the possibilities of embedded real-time
                 implementations of large scale SNNs, we have first
                 chosen to implement a well-known coding scheme based on
                 Gaussian Receptive Fields (GRFs) to map real-value data
                 into spike trains.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Morisita:2010:IEA,
  author =       "Hirokazu Morisita and Kenta Inakagata and Yasunori
                 Osana and Naoyuki Fujita and Hideharu Amano",
  title =        "Implementation and evaluation of an arithmetic
                 pipeline on {FLOPS-$2$D}: multi-{FPGA} system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "4",
  pages =        "8--13",
  month =        sep,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1926367.1926370",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 20 14:27:03 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "UPACS (Unified Platform for Aerospace Computational
                 Simulation) is one of the practical CFD (Computational
                 Fluid Dynamics) packages supporting various
                 selectability. A custom machine for efficient execution
                 of MUSCL; a core functions of UPACS is implemented on
                 FLOPS-2D (Flexibly Linkable Object for Programmable
                 System); multi-FPGA reconfigurable system. The deep and
                 complicated pipeline structure generated from MUSCL
                 dataflow is divided and optimized into two FPGA boards
                 by using a tuning tool called RER. With optimization of
                 the order of operations and pipeline structure, about
                 60\% utilization of the pipeline is achieved even by
                 using serial links between two boards.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tse:2010:ERD,
  author =       "Anson H. T. Tse and David B. Thomas and K. H. Tsoi and
                 Wayne Luk",
  title =        "Efficient reconfigurable design for pricing {Asian}
                 options",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "4",
  pages =        "14--20",
  month =        sep,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1926367.1926371",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 20 14:27:03 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Arithmetic Asian options are financial derivatives
                 which have the feature of path-dependency: they depend
                 on the entire price path of the underlying asset,
                 rather than just the instantaneous price. This
                 path-dependency makes them difficult to price, as only
                 computationally intensive Monte-Carlo methods can
                 provide accurate prices. This paper proposes an
                 FPGA-accelerated Asian option pricing solution, using a
                 highly-optimised parallel Monte-Carlo architecture. The
                 proposed pipelined design is described parametrically,
                 facilitating its re-use for different technologies.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Horita:2010:FBF,
  author =       "Tadayoshi Horita and Itsuo Takanami",
  title =        "An {FPGA}-based fast classifier with high
                 generalization property",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "4",
  pages =        "21--26",
  month =        sep,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1926367.1926372",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 20 14:27:03 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper proposes a scheme to implement classifiers
                 with high generalization properties on FPGAs. The
                 classifiers consist of only combinational logic
                 circuits, which are based on a simple concept, and the
                 VHDL source files which describe the classifiers are
                 generated by a C-language function, tuning VHDL
                 notations for adders in them to reduce both its
                 hardware size and computation time. Simulation results
                 based on a character recognition are shown in terms of
                 generalization property, hardware size, computation
                 time, and electricity consumption.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Putnam:2010:DVE,
  author =       "Andrew Putnam and Aaron Smith and Doug Burger",
  title =        "Dynamic vectorization in the {E2} dynamic multicore
                 architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "4",
  pages =        "27--32",
  month =        sep,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1926367.1926373",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 20 14:27:03 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Previous research has shown that Explicit Data Graph
                 Execution (EDGE) instruction set architectures (ISA)
                 allow for power efficient performance scaling. In this
                 paper we describe the preliminary design of a new
                 dynamic multicore processor called E2 that utilizes an
                 EDGE ISA to allow for the dynamic composition of
                 physical cores into logical processors. We provide
                 details of E2's support for dynamic reconfigurability
                 and show how the EDGE ISA facilitates out-of-order
                 vector execution.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Paek:2010:BAU,
  author =       "Jong Kyung Paek and Kiyoung Choi and Jongeun Lee",
  title =        "Binary acceleration using coarse-grained
                 reconfigurable architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "4",
  pages =        "33--39",
  month =        sep,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1926367.1926374",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 20 14:27:03 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Coarse-grained reconfigurable architectures (CGRAs)
                 have been well-researched and shown to be particularly
                 effective in acceleration of data-intensive
                 applications. However, practical difficulties in
                 application mapping have hindered their widespread
                 adoption. Typically, an application must be modified
                 manually or by using special compilers and design tools
                 in order to fully exploit the architecture. This incurs
                 considerable design costs to the application developer
                 and reduces software portability. In this paper, we
                 propose a framework for automatic transformation of an
                 application at binary-level, with which the user can
                 execute an arbitrary application on the CGRA. Our
                 approach analyzes the binary code and determines which
                 portions of the program to accelerate, maps them to the
                 reconfigurable array, then modifies the binary code
                 appropriately to run on the CGRA.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Dohi:2010:IPE,
  author =       "Keisuke Dohi and Yuichiro Shibata and Tsuyoshi Hamada
                 and Tomonari Masada and Kiyoshi Oguri and Duncan A.
                 Buell",
  title =        "Implementation of a programming environment with a
                 multithread model for reconfigurable systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "4",
  pages =        "40--45",
  month =        sep,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1926367.1926375",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 20 14:27:03 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Reconfigurable systems are known to be able to achieve
                 higher performance than traditional microprocessor
                 architecture for many application fields. However, in
                 order to extract a full potential of the reconfigurable
                 systems, programmers often have to design and describe
                 the best suited code for their target architecture with
                 specialized knowledge. The aim of this paper is to
                 assist the users of reconfigurable systems by
                 implementing a translator with a multithread model. The
                 experimental results show our translator automatically
                 generates efficient performance-aware code segments
                 including DMA transfer and shift registers for memory
                 access optimization.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sabeghi:2010:RMS,
  author =       "Mojtaba Sabeghi and Hamid Mushtaq and Koen Bertels",
  title =        "Runtime multitasking support on polymorphic
                 platforms",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "4",
  pages =        "46--52",
  month =        sep,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1926367.1926376",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 20 14:27:03 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "General purpose computers are moving towards employing
                 reconfigurable fabrics in order to achieve higher
                 performance. In such systems, serving several
                 applications at runtime is a challenging problem in
                 which the reconfigurable fabric has to be shared among
                 competing tasks. Because of the inherent complexity of
                 mapping the computation intensive tasks into the FPGA,
                 a comprehensive runtime system is required to address
                 all the conflicting issues between competing
                 applications' demands and to keep the system
                 performance at the required level. In this paper, we
                 present a runtime environment wherein a number of
                 components are introduced to handle the task assignment
                 problem in a very low overhead manner.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tsoi:2010:PFC,
  author =       "Kuen Hung Tsoi and Anson H. T. Tse and Peter Pietzuch
                 and Wayne Luk",
  title =        "Programming framework for clusters with heterogeneous
                 accelerators",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "4",
  pages =        "53--59",
  month =        sep,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1926367.1926377",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 20 14:27:03 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "We describe a programming framework for high
                 performance clusters with various hardware
                 accelerators. In this framework, users can utilize the
                 available heterogeneous resources productively and
                 efficiently. The distributed application is highly
                 modularized to support dynamic system configuration
                 with changing types and number of the accelerators.
                 Multiple layers of communication interface are
                 introduced to reduce the overhead in both control
                 messages and data transfers. Parallelism can be
                 achieved by controlling the accelerators in various
                 schemes through scheduling extension. The framework has
                 been used to support physics simulation and financial
                 application development.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tadonki:2010:ECL,
  author =       "Claude Tadonki and Gilbert Grosdidier and Olivier
                 P{\`e}ne",
  title =        "An efficient {CELL} library for lattice quantum
                 chromodynamics",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "4",
  pages =        "60--65",
  month =        sep,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1926367.1926378",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 20 14:27:03 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Quantum chromodynamics (QCD) is the theory of
                 subnuclear physics, aiming at modeling the strong
                 nuclear force, which is responsible for the
                 interactions of nuclear particles. Numerical QCD
                 studies are performed through a discrete formalism
                 called LQCD (Lattice Quantum Chromodynamics). Typical
                 simulations involve very large volume of data and
                 numerically sensitive entities, thus the crucial need
                 of high performance computing systems. We propose a set
                 of CELL-accelerated routines for basic LQCD
                 calculations. Our framework is provided as a unified
                 library and is particularly optimized for an iterative
                 use. Each routine is parallelized among the SPUs, and
                 each SPU achieves its task by looping on small chunk of
                 arrays from the main memory.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Taylor:2010:SBB,
  author =       "Ryan Taylor and Xiaoming Li",
  title =        "Software-based branch predication for {AMD GPUs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "4",
  pages =        "66--72",
  month =        sep,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1926367.1926379",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 20 14:27:03 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Branch predication is a program transformation
                 technique that combines instructions of multiple
                 branches of an if statement into a straight-line
                 sequence and associates each instruction of the
                 sequence with a predicate. The branch predication
                 improves the execution of branch statements on
                 processors that support predicated execution of
                 instruction, e.g., Intel IA-64, because such
                 transformation improves the instruction scheduling and
                 might help cache performance. This paper proposes a
                 novel software-based branch predication technique for
                 GPU. The main motivation is that branch instructions
                 can easily become a performance bottleneck for a GPU
                 program because of the cost of branch instructions
                 compared to ALU instructions and the possibility of low
                 ALU utilization due to separation of ALU instructions
                 within control flow blocks.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Banescu:2010:MFP,
  author =       "Sebastian Banescu and Florent de Dinechin and Bogdan
                 Pasca and Radu Tudoran",
  title =        "Multipliers for floating-point double precision and
                 beyond on {FPGAs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "4",
  pages =        "73--79",
  month =        sep,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1926367.1926380",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 20 14:27:03 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The implementation of high-precision floating-point
                 applications on reconfigurable hardware requires large
                 multipliers. Full multipliers are the core of
                 floating-point multipliers. Truncated multipliers,
                 trading resources for a well-controlled accuracy
                 degradation, are useful building blocks in situations
                 where a full multiplier is not needed.\par

                 This work studies the automated generation of such
                 multipliers using the embedded multipliers and adders
                 present in the DSP blocks of current FPGAs. The
                 optimization of such multipliers is expressed as a
                 tiling problem, where a tile represents a hardware
                 multiplier, and super-tiles represent combinations of
                 several hardware multipliers and adders, making
                 efficient use of the DSP internal resources. This
                 tiling technique is shown to adapt to full or truncated
                 multipliers. It addresses arbitrary precisions
                 including single, double but also the quadruple
                 precision introduced by the IEEE-754-2008 standard and
                 currently unsupported by processor hardware. An
                 open-source implementation is provided in the FloPoCo
                 project.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sano:2010:PIA,
  author =       "Kentaro Sano and Luzhou Wang and Satoru Yamamoto",
  title =        "Prototype implementation of array-processor extensible
                 over multiple {FPGAs} for scalable stencil
                 computation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "4",
  pages =        "80--86",
  month =        sep,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1926367.1926381",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 20 14:27:03 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper demonstrates and evaluates the performance
                 and the scalability of the systolic
                 computational-memory array (SCMA) for stencil
                 computation, which is a typical computing kernel of
                 scientific simulation. We describe the basic
                 architecture of the SCMA, and show the requirements and
                 the design of SCMAs to scalably operate over multiple
                 devices. We implement a prototype of the SCMA with
                 three ALTERA Stratix III FPGAs, which form a 1--3 FPGA
                 array by connecting three DE3 boards with different
                 clock sources. The prototype SCMA demonstrates that the
                 difference in operating clock frequency hardly
                 influences the total execution cycles while it slightly
                 causes stall cycles to sub-SCMAs on different FPGAs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tsang:2010:DPR,
  author =       "Chi-Chiu Tsang and Hayden Kwok-Hay So",
  title =        "Dynamic power reduction of {FPGA}-based reconfigurable
                 computers using precomputation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "4",
  pages =        "87--92",
  month =        sep,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1926367.1926382",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 20 14:27:03 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper examines the effectiveness of employing
                 precomputation techniques to reduce power consumption
                 of field configurable computing systems. Multiplier is
                 modified with precomputation techniques and are
                 implemented using commercial off-the-shelf FPGAs.
                 Precomputation techniques reduce dynamic power
                 consumption of a module by eliminating unnecessary
                 signal switching activities in inactive portions of the
                 modules. Experiments have shown that up to 52\% of
                 logic and signal power consumption can be reduced in
                 multiplier module. Furthermore, when compared to ASIC
                 implementations, FPGA implementations of precomputation
                 modules have the advantage of lower area overhead as
                 most of them can be implemented using originally
                 unoccupied related FPGA resources.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:2010:INb,
  author =       "Mark Thorson",
  title =        "{Internet} nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "4",
  pages =        "93--96",
  month =        sep,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1926367.1926384",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 20 14:27:03 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Mukherjee:2010:NAC,
  author =       "Manideepa Mukherjee and Amitabha Sinha",
  title =        "A novel architecture for conversion of binary to
                 single digit double base numbers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "5",
  pages =        "1--6",
  month =        dec,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1978907.1978909",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 13 11:25:46 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Double base number systems are increasingly attractive
                 for many compute intensive applications especially in
                 signal processing because of their capabilities of
                 handling arithmetic operations efficiently. However,
                 the complexity involved in converting binary to DBNS
                 becomes a major bottleneck and the efficiency of
                 performance goes down drastically due to the complexity
                 involved in conversion. Since complexity of multi digit
                 DBNS multiplications and additions increases with the
                 number of digits (index i,j), in this paper a novel
                 conversion scheme has been proposed where a given
                 binary number will be converted to a single digit
                 (index i,j) double base number. The proposed scheme not
                 only reduces the hardware complexity of the arithmetic
                 operations but also reduces the time of execution.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{T:2010:DDF,
  author =       "Shobha T. and Syed Akram and G. Varaprasad",
  title =        "Design and development of framework for diagnosing
                 intermediate nodes",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "38",
  number =       "5",
  pages =        "7--11",
  month =        dec,
  year =         "2010",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1978907.1978910",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 13 11:25:46 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "A framework is an integrated system that sets the
                 rules of Automation of a specific product. This system
                 integrates the function libraries, test data sources,
                 object details and various reusable modules. This paper
                 proposes a framework, used for diagnosing and
                 performance analysis of intermediate network nodes such
                 as load balancer, routers, servers etc. For analyzing
                 the performance $m$ number of servers and $n$ number of
                 clients are considered. This framework will help
                 developers working on network nodes to check for the
                 performance of network node component and also to
                 detect the errors in the algorithms.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@article{Tabba:2010:ACP,
  author          = {Fuad Tabba},
  title           = {Adding concurrency in {Python} using a commercial
                     processor's hardware transactional memory
                     support},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 2010,
  month           = dec,
  volume          = 38,
  number          = 5,
  pages           = {12--19},
  doi             = {https://doi.org/10.1145/1978907.1978911},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  abstract        = {This paper reports on our experiences of using a
                     commercial processor's best-effort hardware
                     transactional memory to improve concurrency in CPython,
                     the reference Python implementation. CPython protects
                     its data structures using a single global lock, which
                     inhibits parallelism when running multiple
                     threads.\par

                     We modified the CPython interpreter to use best-effort
                     hardware transactions available in Sun's Rock
                     processor, and fall back on the single global lock when
                     unable to commit in hardware. The modifications were
                     minimal; however, we had to restructure some of
                     CPython's shared data structures to handle false
                     conflicts arising from CPython's management of the
                     shared data. Our results show that the modified CPython
                     interpreter can run small, simple, workloads and scale
                     almost linearly, while improving the concurrency of
                     more complex workloads.},
  bibdate         = {Fri May 13 11:25:46 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Thomasian:2010:WSD,
  author          = {Alexander Thomasian},
  title           = {Why specialized disks for composite operations
                     may be unnecessary},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 2010,
  month           = dec,
  volume          = 38,
  number          = 5,
  pages           = {20--27},
  doi             = {https://doi.org/10.1145/1978907.1978912},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  abstract        = {Disk arrays with erasure coding such as RAID5 and
                     RAID6 incur four and six disk accesses respectively for
                     updating data and check blocks. The small write penalty
                     can be reduced by the Read-Modify-Write (RMW) composite
                     operations to update data and associated check blocks.
                     The Disk Architecture with Composite Operation (DACO)
                     is a proposal to eliminate the disk rotation associated
                     with RMWs, by using a complex read/write head, which
                     allows the writing of a block immediately after reading
                     and modifying it without needing an extra disk
                     rotation. We argue that the extra cost associated with
                     DACO may not be justifiable, because it is not expected
                     to have a significant impact on RAID performance.
                     Furthermore an XOR capability is still required at the
                     disk array controller for reconstructing missing data
                     blocks. A duplexed Nonvolatile Storage (NVS) cache at
                     the disk array controller provides the same reliability
                     as magnetic disks and allows fast writes, i.e., writing
                     to disk is considered completed as soon as data is
                     written onto NVS. Deferring the destaging of data
                     blocks from NVS allows these blocks to be overwritten,
                     obviating unnecessary disk writes. This also allows
                     neighboring dirty blocks to be destaged in batches, so
                     that a higher disk access efficiency is attained. Disks
                     with multiple arms can also be used to make the
                     processing of RMW requests more efficient, while disks
                     with multiple R/W heads on one arm have little effect
                     on RMW requests. In addition there are alternative
                     methods to update check blocks, such as floating
                     parities, parity logging, the reconstruct write method,
                     log structured arrays, and variable scope parity
                     protection.},
  bibdate         = {Fri May 13 11:25:46 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Thorson:2010:INc,
  author          = {Mark Thorson},
  title           = {{Internet} nuggets},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 2010,
  month           = dec,
  volume          = 38,
  number          = 5,
  pages           = {28--36},
  doi             = {https://doi.org/10.1145/1978907.1978914},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri May 13 11:25:46 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Larus:2011:CWC,
  author          = {James R. Larus},
  title           = {The cloud will change everything},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 2011,
  month           = mar,
  volume          = 39,
  number          = 1,
  pages           = {1--2},
  doi             = {https://doi.org/10.1145/1961295.1950367},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Thu Aug 18 13:45:25 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Yuan:2011:ISD,
  author          = {Ding Yuan and Jing Zheng and Soyeon Park and
                     Yuanyuan Zhou and Stefan Savage},
  title           = {Improving software diagnosability via log
                     enhancement},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 2011,
  month           = mar,
  volume          = 39,
  number          = 1,
  pages           = {3--14},
  doi             = {https://doi.org/10.1145/1961295.1950369},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Thu Aug 18 13:45:25 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Veeraraghavan:2011:DPS,
  author          = {Kaushik Veeraraghavan and Dongyoon Lee and
                     Benjamin Wester and Jessica Ouyang and
                     Peter M. Chen and Jason Flinn and
                     Satish Narayanasamy},
  title           = {{DoublePlay}: parallelizing sequential logging
                     and replay},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 2011,
  month           = mar,
  volume          = 39,
  number          = 1,
  pages           = {15--26},
  doi             = {https://doi.org/10.1145/1961295.1950370},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Thu Aug 18 13:45:25 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Casper:2011:HAT,
  author          = {Jared Casper and Tayo Oguntebi and Sungpack Hong
                     and Nathan G. Bronson and Christos Kozyrakis and
                     Kunle Olukotun},
  title           = {Hardware acceleration of transactional memory on
                     commodity systems},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 2011,
  month           = mar,
  volume          = 39,
  number          = 1,
  pages           = {27--38},
  doi             = {https://doi.org/10.1145/1961295.1950372},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Thu Aug 18 13:45:25 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Dalessandro:2011:HNC,
  author          = {Luke Dalessandro and Fran{\c{c}}ois Carouge and
                     Sean White and Yossi Lev and Mark Moir and
                     Michael L. Scott and Michael F. Spear},
  title           = {Hybrid {NOrec}: a case study in the effectiveness
                     of best effort hardware transactional memory},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 2011,
  month           = mar,
  volume          = 39,
  number          = 1,
  pages           = {39--52},
  doi             = {https://doi.org/10.1145/1961295.1950373},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Thu Aug 18 13:45:25 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Singh:2011:EPS,
  author          = {Abhayendra Singh and Daniel Marino and
                     Satish Narayanasamy and Todd Millstein and
                     Madan Musuvathi},
  title           = {Efficient processor support for {DRFx}, a memory
                     model with exceptions},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 2011,
  month           = mar,
  volume          = 39,
  number          = 1,
  pages           = {53--66},
  doi             = {https://doi.org/10.1145/1961295.1950375},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Thu Aug 18 13:45:25 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Devietti:2011:RRC,
  author          = {Joseph Devietti and Jacob Nelson and Tom Bergan
                     and Luis Ceze and Dan Grossman},
  title           = {{RCDC}: a relaxed consistency deterministic
                     computer},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 2011,
  month           = mar,
  volume          = 39,
  number          = 1,
  pages           = {67--78},
  doi             = {https://doi.org/10.1145/1961295.1950376},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Thu Aug 18 13:45:25 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Burnim:2011:SCS,
  author          = {Jacob Burnim and George Necula and Koushik Sen},
  title           = {Specifying and checking semantic atomicity for
                     multithreaded programs},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 2011,
  month           = mar,
  volume          = 39,
  number          = 1,
  pages           = {79--90},
  doi             = {https://doi.org/10.1145/1961295.1950377},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Thu Aug 18 13:45:25 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Volos:2011:MLP,
  author          = {Haris Volos and Andres Jaan Tack and
                     Michael M. Swift},
  title           = {{Mnemosyne}: lightweight persistent memory},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 2011,
  month           = mar,
  volume          = 39,
  number          = 1,
  pages           = {91--104},
  doi             = {https://doi.org/10.1145/1961295.1950379},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Thu Aug 18 13:45:25 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Coburn:2011:NHM,
  author          = {Joel Coburn and Adrian M. Caulfield and Ameen Akel
                     and Laura M. Grupp and Rajesh K. Gupta and
                     Ranjit Jhala and Steven Swanson},
  title           = {{NV-Heaps}: making persistent objects fast and
                     safe with next-generation, non-volatile memories},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 2011,
  month           = mar,
  volume          = 39,
  number          = 1,
  pages           = {105--118},
  doi             = {https://doi.org/10.1145/1961295.1950380},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Thu Aug 18 13:45:25 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Schupbach:2011:DLA,
  author          = {Adrian Sch{\"u}pbach and Andrew Baumann and
                     Timothy Roscoe and Simon Peter},
  title           = {A declarative language approach to device
                     configuration},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 2011,
  month           = mar,
  volume          = 39,
  number          = 1,
  pages           = {119--132},
  doi             = {https://doi.org/10.1145/1961295.1950382},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Thu Aug 18 13:45:25 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Ryzhyk:2011:IDD,
  author          = {Leonid Ryzhyk and John Keys and Balachandra Mirla
                     and Arun Raghunath and Mona Vij and Gernot Heiser},
  title           = {Improved device driver reliability through
                     hardware verification reuse},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 2011,
  month           = mar,
  volume          = 39,
  number          = 1,
  pages           = {133--144},
  doi             = {https://doi.org/10.1145/1961295.1950383},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Thu Aug 18 13:45:25 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Hashmi:2011:CNI,
  author          = {Atif Hashmi and Andrew Nere and James Jamal Thomas
                     and Mikko Lipasti},
  title           = {A case for neuromorphic {ISAs}},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 2011,
  month           = mar,
  volume          = 39,
  number          = 1,
  pages           = {145--158},
  doi             = {https://doi.org/10.1145/1961295.1950385},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Thu Aug 18 13:45:25 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Ransford:2011:MSS,
  author          = {Benjamin Ransford and Jacob Sorber and Kevin Fu},
  title           = {{Mementos}: system support for long-running
                     computation on {RFID}-scale devices},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 2011,
  month           = mar,
  volume          = 39,
  number          = 1,
  pages           = {159--170},
  doi             = {https://doi.org/10.1145/1961295.1950386},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Thu Aug 18 13:45:25 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Koukoumidis:2011:PC,
  author          = {Emmanouil Koukoumidis and Dimitrios Lymberopoulos
                     and Karin Strauss and Jie Liu and Doug Burger},
  title           = {Pocket cloudlets},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 2011,
  month           = mar,
  volume          = 39,
  number          = 1,
  pages           = {171--184},
  doi             = {https://doi.org/10.1145/1961295.1950387},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Thu Aug 18 13:45:25 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Sharma:2011:BMS,
  author          = {Navin Sharma and Sean Barker and David Irwin and
                     Prashant Shenoy},
  title           = {{Blink}: managing server clusters on intermittent
                     power},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 2011,
  month           = mar,
  volume          = 39,
  number          = 1,
  pages           = {185--198},
  doi             = {https://doi.org/10.1145/1961295.1950389},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Thu Aug 18 13:45:25 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Hoffmann:2011:DKR,
  author          = {Henry Hoffmann and Stelios Sidiroglou and
                     Michael Carbin and Sasa Misailovic and
                     Anant Agarwal and Martin Rinard},
  title           = {Dynamic knobs for responsive power-aware
                     computing},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 2011,
  month           = mar,
  volume          = 39,
  number          = 1,
  pages           = {199--212},
  doi             = {https://doi.org/10.1145/1961295.1950390},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Thu Aug 18 13:45:25 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Liu:2011:FSD,
  author          = {Song Liu and Karthik Pattabiraman and
                     Thomas Moscibroda and Benjamin G. Zorn},
  title           = {{Flikker}: saving {DRAM} refresh-power through
                     critical data partitioning},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 2011,
  month           = mar,
  volume          = 39,
  number          = 1,
  pages           = {213--224},
  doi             = {https://doi.org/10.1145/1961295.1950391},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Thu Aug 18 13:45:25 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Deng:2011:MAL,
  author          = {Qingyuan Deng and David Meisner and Luiz Ramos and
                     Thomas F. Wenisch and Ricardo Bianchini},
  title           = {{MemScale}: active low-power modes for main
                     memory},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 2011,
  month           = mar,
  volume          = 39,
  number          = 1,
  pages           = {225--238},
  doi             = {https://doi.org/10.1145/1961295.1950392},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Thu Aug 18 13:45:25 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Gao:2011:TMH,
  author          = {Qi Gao and Wenbin Zhang and Zhezhe Chen and
                     Mai Zheng and Feng Qin},
  title           = {{2ndStrike}: toward manifesting hidden
                     concurrency typestate bugs},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 2011,
  month           = mar,
  volume          = 39,
  number          = 1,
  pages           = {239--250},
  doi             = {https://doi.org/10.1145/1961295.1950394},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Thu Aug 18 13:45:25 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Zhang:2011:CDC,
  author          = {Wei Zhang and Junghee Lim and Ramya Olichandran
                     and Joel Scherpelz and Guoliang Jin and Shan Lu
                     and Thomas Reps},
  title           = {{ConSeq}: detecting concurrency bugs through
                     sequential errors},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 2011,
  month           = mar,
  volume          = 39,
  number          = 1,
  pages           = {251--264},
  doi             = {https://doi.org/10.1145/1961295.1950395},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Thu Aug 18 13:45:25 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Chipounov:2011:SPV,
  author          = {Vitaly Chipounov and Volodymyr Kuznetsov and
                     George Candea},
  title           = {{S2E}: a platform for in-vivo multi-path analysis
                     of software systems},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 2011,
  month           = mar,
  volume          = 39,
  number          = 1,
  pages           = {265--278},
  doi             = {https://doi.org/10.1145/1961295.1950396},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Thu Aug 18 13:45:25 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Hofmann:2011:EOS,
  author          = {Owen S. Hofmann and Alan M. Dunn and Sangman Kim
                     and Indrajit Roy and Emmett Witchel},
  title           = {Ensuring operating system kernel integrity with
                     {OSck}},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 2011,
  month           = mar,
  volume          = 39,
  number          = 1,
  pages           = {279--290},
  doi             = {https://doi.org/10.1145/1961295.1950398},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Thu Aug 18 13:45:25 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Porter:2011:RLT,
  author          = {Donald E. Porter and Silas Boyd-Wickizer and
                     Jon Howell and Reuben Olinsky and Galen C. Hunt},
  title           = {Rethinking the library {OS} from the top down},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 2011,
  month           = mar,
  volume          = 39,
  number          = 1,
  pages           = {291--304},
  doi             = {https://doi.org/10.1145/1961295.1950399},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Thu Aug 18 13:45:25 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Palix:2011:FLT,
  author          = {Nicolas Palix and Ga{\"e}l Thomas and Suman Saha
                     and Christophe Calv{\`e}s and Julia Lawall and
                     Gilles Muller},
  title           = {Faults in {Linux}: ten years later},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 2011,
  month           = mar,
  volume          = 39,
  number          = 1,
  pages           = {305--318},
  doi             = {https://doi.org/10.1145/1961295.1950401},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  abstract        = {In 2001, Chou et al. published a study of faults found
                     by applying a static analyzer to Linux versions 1.0
                     through 2.4.1. A major result of their work was that
                     the drivers directory contained up to 7 times more of
                     certain kinds of faults than other directories. This
                     result inspired a number of development and research
                     efforts on improving the reliability of driver code.
                     Today Linux is used in a much wider range of
                     environments, provides a much wider range of services,
                     and has adopted a new development and release model.
                     What has been the impact of these changes on code
                     quality? Are drivers still a major problem?\par

                     To answer these questions, we have transported the
                     experiments of Chou et al. to Linux versions 2.6.0 to
                     2.6.33, released between late 2003 and early 2010. We
                     find that Linux has more than doubled in size during
                     this period, but that the number of faults per line of
                     code has been decreasing. And, even though drivers
                     still accounts for a large part of the kernel code and
                     contains the most faults, its fault rate is now below
                     that of other directories, such as arch (HAL) and fs
                     (file systems). These results can guide further
                     development and research efforts. To enable others to
                     continually update these results as Linux evolves, we
                     define our experimental protocol and make our checkers
                     and results available in a public archive.},
  bibdate         = {Thu Aug 18 13:45:25 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Esmaeilzadeh:2011:LBL,
  author          = {Hadi Esmaeilzadeh and Ting Cao and Yang Xi and
                     Stephen M. Blackburn and Kathryn S. McKinley},
  title           = {Looking back on the language and hardware
                     revolutions: measured power, performance, and
                     scaling},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 2011,
  month           = mar,
  volume          = 39,
  number          = 1,
  pages           = {319--332},
  doi             = {https://doi.org/10.1145/1961295.1950402},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Thu Aug 18 13:45:25 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Nguyen:2011:SCS,
  author          = {Donald Nguyen and Keshav Pingali},
  title           = {Synthesizing concurrent schedulers for irregular
                     algorithms},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 2011,
  month           = mar,
  volume          = 39,
  number          = 1,
  pages           = {333--344},
  doi             = {https://doi.org/10.1145/1961295.1950404},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Thu Aug 18 13:45:25 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Hoang:2011:ECT,
  author          = {Giang Hoang and Robby Bruce Findler and
                     Russ Joseph},
  title           = {Exploring circuit timing-aware language and
                     compilation},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 2011,
  month           = mar,
  volume          = 39,
  number          = 1,
  pages           = {345--356},
  doi             = {https://doi.org/10.1145/1961295.1950405},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Thu Aug 18 13:45:25 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Farhad:2011:OAM,
  author          = {Sardar M. Farhad and Yousun Ko and
                     Bernd Burgstaller and Bernhard Scholz},
  title           = {Orchestration by approximation: mapping stream
                     programs onto multicore architectures},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 2011,
  month           = mar,
  volume          = 39,
  number          = 1,
  pages           = {357--368},
  doi             = {https://doi.org/10.1145/1961295.1950406},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Thu Aug 18 13:45:25 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Zhang:2011:FED,
  author          = {Eddy Z. Zhang and Yunlian Jiang and Ziyu Guo and
                     Kai Tian and Xipeng Shen},
  title           = {On-the-fly elimination of dynamic irregularities
                     for {GPU} computing},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = 2011,
  month           = mar,
  volume          = 39,
  number          = 1,
  pages           = {369--380},
  doi             = {https://doi.org/10.1145/1961295.1950408},
  issn            = {0163-5964 (ACM), 0884-7495 (IEEE)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Thu Aug 18 13:45:25 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Hormati:2011:SPS,
  author =       "Amir H. Hormati and Mehrzad Samadi and Mark Woh and
                 Trevor Mudge and Scott Mahlke",
  title =        "{Sponge}: portable stream programming on graphics
                 engines",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "1",
  pages =        "381--392",
  month =        mar,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1961295.1950409",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Aug 18 13:45:25 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kamruzzaman:2011:ICP,
  author =       "Md Kamruzzaman and Steven Swanson and Dean M.
                 Tullsen",
  title =        "Inter-core prefetching for multicore processors using
                 migrating helper threads",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "1",
  pages =        "393--404",
  month =        mar,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1961295.1950411",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Aug 18 13:45:25 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hayashizaki:2011:IPT,
  author =       "Hiroshige Hayashizaki and Peng Wu and Hiroshi Inoue
                 and Mauricio J. Serrano and Toshio Nakatani",
  title =        "Improving the performance of trace-based systems by
                 false loop filtering",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "1",
  pages =        "405--418",
  month =        mar,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/1961295.1950412",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Aug 18 13:45:25 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Binkert:2011:GS,
  author =       "Nathan Binkert and Bradford Beckmann and Gabriel Black
                 and Steven K. Reinhardt and Ali Saidi and Arkaprava
                 Basu and Joel Hestness and Derek R. Hower and Tushar
                 Krishna and Somayeh Sardashti and Rathijit Sen and
                 Korey Sewell and Muhammad Shoaib and Nilay Vaish and
                 Mark D. Hill and David A. Wood",
  title =        "The {\tt gem5} simulator",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "2",
  pages =        "1--7",
  month =        may,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2024716.2024718",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 1 17:35:28 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The gem5 simulation infrastructure is the merger of
                 the best aspects of the M5 [4] and GEMS [9] simulators.
                 M5 provides a highly configurable simulation framework,
                 multiple ISAs, and diverse CPU models. GEMS complements
                 these features with a detailed and flexible memory
                 system, including support for multiple cache coherence
                 protocols and interconnect models. Currently, gem5
                 supports most commercial ISAs (ARM, ALPHA, MIPS, Power,
                 SPARC, and x86), including booting Linux on three of
                 them (ARM, ALPHA, and x86). The project is the result
                 of the combined efforts of many academic and industrial
                 institutions, including AMD, ARM, HP, MIPS, Princeton,
                 MIT, and the Universities of Michigan, Texas, and
                 Wisconsin.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thomasian:2011:SAD,
  author =       "Alexander Thomasian",
  title =        "Survey and analysis of disk scheduling methods",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "2",
  pages =        "8--25",
  month =        may,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2024716.2024719",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 1 17:35:28 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Performance of many important computer applications
                 depends on the performance of Hard Disk Drives (HDDs).
                 Disk capacities and transfer rates have been increasing
                 rapidly, but the improvement in disk access time is
                 disappointingly slow. Caching and prefetching are two
                 method to alleviate this delay, which is 6-7 orders of
                 magnitude longer than the processor cycle time. Disk
                 scheduling is desirable when the data is not cached and
                 a disk access is required. This paper is concerned with
                 the analysis of two disk arm scheduling methods: SATF
                 (shortest access time first) which outperforms SCAN,
                 while both methods outperform FCFS scheduling. We
                 propose improvements to a recent analysis of the SCAN
                 policy and carry out an empirical investigation of SATF
                 performance to derive a relationship between the
                 queue-length and mean service time.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{K:2011:LPT,
  author =       "Thimmarayaswamy K and Mary M. Dsouza and G.
                 Varaprasad",
  title =        "Low power techniques for an {Android} based phone",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "2",
  pages =        "26--35",
  month =        may,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2024716.2024720",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 1 17:35:28 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Android is the latest trend in mobile operating
                 systems. Even though Android provides a complete set of
                 application, middleware and Linux kernel for the phone
                 applications developer, it does not fully utilize
                 several standard kernel features. This work attempts to
                 address the limitations of Android specific to power
                 management at kernel level and proposes possible
                 solutions for active and static power management in
                 Linux to overcome these limitations. The developed
                 solutions for active power management include selection
                 of suitable governor algorithm and modification of its
                 parameters and implementation of a daemon process,
                 which performs voltage and frequency scaling.
                 Application level low power techniques for Android are
                 also proposed to help application developers to
                 optimize their software.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@article{Thorson:2011:INa,
  author          = {Mark Thorson},
  title           = {{Internet} nuggets},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {2},
  pages           = {36--52},
  year            = {2011},
  month           = may,
  DOI             = {https://doi.org/10.1145/2024716.2024722},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Thu Sep 1 17:35:28 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Hashmi:2011:AAF,
  author =       "Atif Hashmi and Hugues Berry and Olivier Temam and
                 Mikko Lipasti",
  title =        "Automatic abstraction and fault tolerance in cortical
                 microarchitectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "3",
  pages =        "1--10",
  month =        jun,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2024723.2000066",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Sep 5 17:15:11 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@article{Choudhary:2011:FCS,
  author          = {Niket K. Choudhary and Salil V. Wadhavkar and
                     Tanmay A. Shah and Hiran Mayukh and Jayneel Gandhi and
                     Brandon H. Dwiel and Sandeep Navada and
                     Hashem H. Najaf-abadi and Eric Rotenberg},
  title           = {{FabScalar}: composing synthesizable {RTL} designs
                     of arbitrary cores within a canonical superscalar
                     template},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {3},
  pages           = {11--22},
  year            = {2011},
  month           = jun,
  DOI             = {https://doi.org/10.1145/2024723.2000067},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Mon Sep 5 17:15:11 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Gunadi:2011:CCR,
  author          = {Erika Gunadi and Mikko H. Lipasti},
  title           = {{CRIB}: consolidated rename, issue, and bypass},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {3},
  pages           = {23--32},
  year            = {2011},
  month           = jun,
  DOI             = {https://doi.org/10.1145/2024723.2000068},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Mon Sep 5 17:15:11 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Agarwal:2011:FIF,
  author          = {Rishi Agarwal and Josep Torrellas},
  title           = {{FlexBulk}: intelligently forming atomic blocks in
                     blocked-execution multiprocessors to minimize
                     squashes},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {3},
  pages           = {33--44},
  year            = {2011},
  month           = jun,
  DOI             = {https://doi.org/10.1145/2024723.2000070},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Mon Sep 5 17:15:11 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Kwon:2011:VPA,
  author          = {Youngjin Kwon and Changdae Kim and Seungryoul Maeng and
                     Jaehyuk Huh},
  title           = {Virtualizing performance asymmetric multi-core
                     systems},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {3},
  pages           = {45--56},
  year            = {2011},
  month           = jun,
  DOI             = {https://doi.org/10.1145/2024723.2000071},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Mon Sep 5 17:15:11 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Sanchez:2011:VSE,
  author          = {Daniel Sanchez and Christos Kozyrakis},
  title           = {{Vantage}: scalable and efficient fine-grain cache
                     partitioning},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {3},
  pages           = {57--68},
  year            = {2011},
  month           = jun,
  DOI             = {https://doi.org/10.1145/2024723.2000073},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Mon Sep 5 17:15:11 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Mishra:2011:ACI,
  author          = {Asit K. Mishra and Xiangyu Dong and Guangyu Sun and
                     Yuan Xie and N. Vijaykrishnan and Chita R. Das},
  title           = {Architecting on-chip interconnects for stacked
                     {$3$D} {STT-RAM} caches in {CMPs}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {3},
  pages           = {69--80},
  year            = {2011},
  month           = jun,
  DOI             = {https://doi.org/10.1145/2024723.2000074},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Mon Sep 5 17:15:11 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Gaur:2011:BIA,
  author          = {Jayesh Gaur and Mainak Chaudhuri and
                     Sreenivas Subramoney},
  title           = {Bypass and insertion algorithms for exclusive
                     last-level caches},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {3},
  pages           = {81--92},
  year            = {2011},
  month           = jun,
  DOI             = {https://doi.org/10.1145/2024723.2000075},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Mon Sep 5 17:15:11 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Cuesta:2011:IED,
  author          = {Blas A. Cuesta and Alberto Ros and
                     Mar{\'\i}a E. G{\'o}mez and Antonio Robles and
                     Jos{\'e} F. Duato},
  title           = {Increasing the effectiveness of directory caches by
                     deactivating coherence for private memory blocks},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {3},
  pages           = {93--104},
  year            = {2011},
  month           = jun,
  DOI             = {https://doi.org/10.1145/2024723.2000076},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Mon Sep 5 17:15:11 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Oh:2011:TSM,
  author          = {Jungju Oh and Milos Prvulovic and Alenka Zajic},
  title           = {{TLSync}: support for multiple fast barriers using
                     on-chip transmission lines},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {3},
  pages           = {105--116},
  year            = {2011},
  month           = jun,
  DOI             = {https://doi.org/10.1145/2024723.2000078},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Mon Sep 5 17:15:11 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Crago:2011:OEM,
  author          = {Neal Clayton Crago and Sanjay Jeram Patel},
  title           = {{OUTRIDER}: efficient memory latency tolerance with
                     decoupled strands},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {3},
  pages           = {117--128},
  year            = {2011},
  month           = jun,
  DOI             = {https://doi.org/10.1145/2024723.2000079},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Mon Sep 5 17:15:11 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Lee:2011:ETB,
  author          = {Yunsup Lee and Rimas Avizienis and Alex Bishara and
                     Richard Xia and Derek Lockhart and
                     Christopher Batten and Krste Asanovi{\'c}},
  title           = {Exploring the tradeoffs between programmability and
                     efficiency in data-parallel accelerators},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {3},
  pages           = {129--140},
  year            = {2011},
  month           = jun,
  DOI             = {https://doi.org/10.1145/2024723.2000080},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Mon Sep 5 17:15:11 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Ebrahimi:2011:PAS,
  author          = {Eiman Ebrahimi and Chang Joo Lee and Onur Mutlu and
                     Yale N. Patt},
  title           = {Prefetch-aware shared resource management for
                     multi-core systems},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {3},
  pages           = {141--152},
  year            = {2011},
  month           = jun,
  DOI             = {https://doi.org/10.1145/2024723.2000081},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Mon Sep 5 17:15:11 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Agarwal:2011:RSC,
  author =       "Rishi Agarwal and Pranav Garg and Josep Torrellas",
  title =        "{Rebound}: scalable checkpointing for coherent shared
                 memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "3",
  pages =        "153--164",
  month =        jun,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2024723.2000083",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Sep 5 17:15:11 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@article{Greathouse:2011:DDS,
  author          = {Joseph L. Greathouse and Zhiqiang Ma and
                     Matthew I. Frank and Ramesh Peri and Todd Austin},
  title           = {Demand-driven software race detection using hardware
                     performance counters},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {3},
  pages           = {165--176},
  year            = {2011},
  month           = jun,
  DOI             = {https://doi.org/10.1145/2024723.2000084},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Mon Sep 5 17:15:11 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Chhabra:2011:NSN,
  author          = {Siddhartha Chhabra and Yan Solihin},
  title           = {{i-NVMM}: a secure non-volatile main memory system
                     with incremental encryption},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {3},
  pages           = {177--188},
  year            = {2011},
  month           = jun,
  DOI             = {https://doi.org/10.1145/2024723.2000086},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Mon Sep 5 17:15:11 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Tiwari:2011:CUM,
  author          = {Mohit Tiwari and Jason K. Oberg and Xun Li and
                     Jonathan Valamehr and Timothy Levin and
                     Ben Hardekopf and Ryan Kastner and
                     Frederic T. Chong and Timothy Sherwood},
  title           = {Crafting a usable microkernel, processor, and {I/O}
                     system with strict and provable information flow
                     security},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {3},
  pages           = {189--200},
  year            = {2011},
  month           = jun,
  DOI             = {https://doi.org/10.1145/2024723.2000087},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Mon Sep 5 17:15:11 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Nomura:2011:SDP,
  author          = {Shuou Nomura and Matthew D. Sinclair and Chen-Han Ho and
                     Venkatraman Govindaraju and Marc de Kruijf and
                     Karthikeyan Sankaralingam},
  title           = {Sampling $+$ {DMR}: practical and low-overhead
                     permanent fault detection},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {3},
  pages           = {201--212},
  year            = {2011},
  month           = jun,
  DOI             = {https://doi.org/10.1145/2024723.2000089},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Mon Sep 5 17:15:11 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Sudhakrishnan:2011:REB,
  author          = {Sangeetha Sudhakrishnan and Rigo Dicochea and
                     Jose Renau},
  title           = {Releasing efficient beta cores to market early},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {3},
  pages           = {213--222},
  year            = {2011},
  month           = jun,
  DOI             = {https://doi.org/10.1145/2024723.2000090},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Mon Sep 5 17:15:11 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Manoochehri:2011:CCP,
  author          = {Mehrtash Manoochehri and Murali Annavaram and
                     Michel Dubois},
  title           = {{CPPC}: correctable parity protected cache},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {3},
  pages           = {223--234},
  year            = {2011},
  month           = jun,
  DOI             = {https://doi.org/10.1145/2024723.2000091},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Mon Sep 5 17:15:11 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Gebhart:2011:EEM,
  author          = {Mark Gebhart and Daniel R. Johnson and David Tarjan and
                     Stephen W. Keckler and William J. Dally and
                     Erik Lindholm and Kevin Skadron},
  title           = {Energy-efficient mechanisms for managing thread
                     context in throughput processors},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {3},
  pages           = {235--246},
  year            = {2011},
  month           = jun,
  DOI             = {https://doi.org/10.1145/2024723.2000093},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Mon Sep 5 17:15:11 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Yu:2011:SDH,
  author          = {Wing-kei S. Yu and Ruirui Huang and Sarah Q. Xu and
                     Sung-En Wang and Edwin Kan and G. Edward Suh},
  title           = {{SRAM--DRAM} hybrid memory with applications to
                     efficient register files in fine-grained
                     multi-threading},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {3},
  pages           = {247--258},
  year            = {2011},
  month           = jun,
  DOI             = {https://doi.org/10.1145/2024723.2000094},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Mon Sep 5 17:15:11 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Fu:2011:ATM,
  author          = {Binzhang Fu and Yinhe Han and Jun Ma and Huawei Li and
                     Xiaowei Li},
  title           = {An abacus turn model for time\slash space-efficient
                     reconfigurable routing},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {3},
  pages           = {259--270},
  year            = {2011},
  month           = jun,
  DOI             = {https://doi.org/10.1145/2024723.2000096},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Mon Sep 5 17:15:11 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Carpenter:2011:CGS,
  author          = {Aaron Carpenter and Jianyun Hu and Jie Xu and
                     Michael Huang and Hui Wu},
  title           = {A case for globally shared-medium on-chip
                     interconnect},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {3},
  pages           = {271--282},
  year            = {2011},
  month           = jun,
  DOI             = {https://doi.org/10.1145/2024723.2000097},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Mon Sep 5 17:15:11 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Tang:2011:IMS,
  author          = {Lingjia Tang and Jason Mars and Neil Vachharajani and
                     Robert Hundt and Mary Lou Soffa},
  title           = {The impact of memory subsystem resource sharing on
                     datacenter applications},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {3},
  pages           = {283--294},
  year            = {2011},
  month           = jun,
  DOI             = {https://doi.org/10.1145/2024723.2000099},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Mon Sep 5 17:15:11 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Yoon:2011:AGM,
  author          = {Doe Hyun Yoon and Min Kyu Jeong and Mattan Erez},
  title           = {Adaptive granularity memory systems: a tradeoff
                     between storage efficiency and throughput},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {3},
  pages           = {295--306},
  year            = {2011},
  month           = jun,
  DOI             = {https://doi.org/10.1145/2024723.2000100},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Mon Sep 5 17:15:11 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Barr:2011:SMS,
  author          = {Thomas W. Barr and Alan L. Cox and Scott Rixner},
  title           = {{SpecTLB}: a mechanism for speculative address
                     translation},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {3},
  pages           = {307--318},
  year            = {2011},
  month           = jun,
  DOI             = {https://doi.org/10.1145/2024723.2000101},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Mon Sep 5 17:15:11 MDT 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Meisner:2011:PMO,
  author =       {David Meisner and Christopher M. Sadler and Luiz
                 Andr{\'e} Barroso and Wolf-Dietrich Weber and Thomas F.
                 Wenisch},
  title =        {Power management of online data-intensive services},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {39},
  number =       {3},
  pages =        {319--330},
  month =        jun,
  year =         {2011},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/2024723.2000103},
  ISSN =         {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L =       {0163-5964},
  bibdate =      {Mon Sep 5 17:15:11 MDT 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Biswas:2011:FFF,
  author =       {Susmit Biswas and Mohit Tiwari and Timothy Sherwood
                 and Luke Theogarajan and Frederic T. Chong},
  title =        {Fighting fire with fire: modeling the datacenter-scale
                 effects of targeted superlattice thermal management},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {39},
  number =       {3},
  pages =        {331--340},
  month =        jun,
  year =         {2011},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/2024723.2000104},
  ISSN =         {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L =       {0163-5964},
  bibdate =      {Mon Sep 5 17:15:11 MDT 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Govindan:2011:BLT,
  author =       {Sriram Govindan and Anand Sivasubramaniam and Bhuvan
                 Urgaonkar},
  title =        {Benefits and limitations of tapping into stored energy
                 for datacenters},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {39},
  number =       {3},
  pages =        {341--352},
  month =        jun,
  year =         {2011},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/2024723.2000105},
  ISSN =         {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L =       {0163-5964},
  bibdate =      {Mon Sep 5 17:15:11 MDT 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Demme:2011:RIA,
  author =       {John Demme and Simha Sethumadhavan},
  title =        {Rapid identification of architectural bottlenecks via
                 precise event counting},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {39},
  number =       {3},
  pages =        {353--364},
  month =        jun,
  year =         {2011},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/2024723.2000107},
  ISSN =         {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L =       {0163-5964},
  bibdate =      {Mon Sep 5 17:15:11 MDT 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Esmaeilzadeh:2011:DSE,
  author =       {Hadi Esmaeilzadeh and Emily Blem and Renee {St. Amant}
                 and Karthikeyan Sankaralingam and Doug Burger},
  title =        {Dark silicon and the end of multicore scaling},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {39},
  number =       {3},
  pages =        {365--376},
  month =        jun,
  year =         {2011},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/2024723.2000108},
  ISSN =         {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L =       {0163-5964},
  bibdate =      {Mon Sep 5 17:15:11 MDT 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Sun:2011:MME,
  author =       {Guangyu Sun and Christopher J. Hughes and Changkyu Kim
                 and Jishen Zhao and Cong Xu and Yuan Xie and Yen-Kuang
                 Chen},
  title =        {{Moguls}: a model to explore the memory hierarchy for
                 bandwidth improvements},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {39},
  number =       {3},
  pages =        {377--388},
  month =        jun,
  year =         {2011},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/2024723.2000109},
  ISSN =         {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L =       {0163-5964},
  bibdate =      {Mon Sep 5 17:15:11 MDT 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Mishra:2011:CHC,
  author =       {Asit K. Mishra and N. Vijaykrishnan and Chita R. Das},
  title =        {A case for heterogeneous on-chip interconnects for
                 {CMPs}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {39},
  number =       {3},
  pages =        {389--400},
  month =        jun,
  year =         {2011},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/2024723.2000111},
  ISSN =         {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L =       {0163-5964},
  bibdate =      {Mon Sep 5 17:15:11 MDT 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Grot:2011:KNH,
  author =       {Boris Grot and Joel Hestness and Stephen W. Keckler
                 and Onur Mutlu},
  title =        {{Kilo-NOC}: a heterogeneous network-on-chip
                 architecture for scalability and service guarantees},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {39},
  number =       {3},
  pages =        {401--412},
  month =        jun,
  year =         {2011},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/2024723.2000112},
  ISSN =         {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L =       {0163-5964},
  bibdate =      {Mon Sep 5 17:15:11 MDT 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Ma:2011:DER,
  author =       {Sheng Ma and Natalie Enright Jerger and Zhiying Wang},
  title =        {{DBAR}: an efficient routing algorithm to support
                 multiple concurrent applications in networks-on-chip},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {39},
  number =       {3},
  pages =        {413--424},
  month =        jun,
  year =         {2011},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/2024723.2000113},
  ISSN =         {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L =       {0163-5964},
  bibdate =      {Mon Sep 5 17:15:11 MDT 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Udipi:2011:CMC,
  author =       {Aniruddha N. Udipi and Naveen Muralimanohar and Rajeev
                 Balasubramonian and Al Davis and Norman P. Jouppi},
  title =        {Combining memory and a controller with photonics
                 through {$3$D}-stacking to enable scalable and
                 energy-efficient systems},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {39},
  number =       {3},
  pages =        {425--436},
  month =        jun,
  year =         {2011},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/2024723.2000115},
  ISSN =         {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L =       {0163-5964},
  bibdate =      {Mon Sep 5 17:15:11 MDT 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Binkert:2011:ROF,
  author =       {Nathan Binkert and Al Davis and Norman P. Jouppi and
                 Moray McLaren and Naveen Muralimanohar and Robert
                 Schreiber and Jung Ho Ahn},
  title =        {The role of optics in future high radix switch
                 design},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {39},
  number =       {3},
  pages =        {437--448},
  month =        jun,
  year =         {2011},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/2024723.2000116},
  ISSN =         {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L =       {0163-5964},
  bibdate =      {Mon Sep 5 17:15:11 MDT 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Ma:2011:SPC,
  author =       {Kai Ma and Xue Li and Ming Chen and Xiaorui Wang},
  title =        {Scalable power control for many-core architectures
                 running multi-threaded applications},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {39},
  number =       {3},
  pages =        {449--460},
  month =        jun,
  year =         {2011},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/2024723.2000117},
  ISSN =         {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L =       {0163-5964},
  bibdate =      {Mon Sep 5 17:15:11 MDT 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Alameldeen:2011:EEC,
  author =       {Alaa R. Alameldeen and Ilya Wagner and Zeshan Chishti
                 and Wei Wu and Chris Wilkerson and Shih-Lien Lu},
  title =        {Energy-efficient cache design using variable-strength
                 error-correcting codes},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {39},
  number =       {3},
  pages =        {461--472},
  month =        jun,
  year =         {2011},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/2024723.2000118},
  ISSN =         {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L =       {0163-5964},
  bibdate =      {Mon Sep 5 17:15:11 MDT 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Barroso:2011:WSC,
  author =       "Luiz Andr{\'e} Barroso",
  title =        "Warehouse-Scale Computing: Entering the Teenage
                 Decade",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "3",
  pages =        "??--??",
  month =        jun,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2024723.2019527",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Sep 5 17:15:11 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ferrucci:2011:IWD,
  author =       {David A. Ferrucci},
  title =        {{IBM}'s {Watson\slash DeepQA}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {39},
  number =       {3},
  pages =        {??--??},
  month =        jun,
  year =         {2011},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/2024723.2019525},
  ISSN =         {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L =       {0163-5964},
  bibdate =      {Mon Sep 5 17:15:11 MDT 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Kannan:2011:ARH,
  author =       {Ravi Kannan},
  title =        {Algorithms: Recent Highlights and Challenges},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {39},
  number =       {3},
  pages =        {??--??},
  month =        jun,
  year =         {2011},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/2024723.2019526},
  ISSN =         {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L =       {0163-5964},
  bibdate =      {Mon Sep 5 17:15:11 MDT 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Leeser:2011:CWP,
  author =       {Miriam Leeser and Devon Yablonski and Dana Brooks and
                 Laurie Smith King},
  title =        {The challenges of writing portable, correct and high
                 performance libraries for {GPUs}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {39},
  number =       {4},
  pages =        {2--7},
  month =        sep,
  year =         {2011},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/2082156.2082158},
  ISSN =         {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L =       {0163-5964},
  bibdate =      {Tue Dec 20 17:53:58 MST 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract =     {Graphics Processing Units (GPUs) are widely used to
                 accelerate scientific applications. Many successes have
                 been reported with speedups of two or three orders of
                 magnitude over serial implementations of the same
                 algorithms. These speedups typically pertain to a
                 specific implementation with fixed parameters mapped to
                 a specific hardware implementation. The implementations
                 are not designed to be easily ported to other GPUs,
                 even from the same manufacturer. When target hardware
                 changes, the application must be re-optimized. In this
                 paper we address a different problem. We aim to deliver
                 working, efficient GPU code in a library that is
                 downloaded and run by many different users.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Tsoi:2011:PPO,
  author =       {Kuen Hung Tsoi and Wayne Luk},
  title =        {Power profiling and optimization for heterogeneous
                 multi-core systems},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {39},
  number =       {4},
  pages =        {8--13},
  month =        sep,
  year =         {2011},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/2082156.2082159},
  ISSN =         {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L =       {0163-5964},
  bibdate =      {Tue Dec 20 17:53:58 MST 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract =     {Processing speed and energy efficiency are two of the
                 most critical issues for computer systems. This paper
                 presents a systematic approach for profiling the power
                 and performance characteristics of application
                 targeting heterogeneous multi-core computing platforms.
                 Our approach enables rapid and automated design space
                 exploration involving optimisation of workload
                 distribution for systems with accelerators such as
                 FPGAs and GPUs. We demonstrate that, with minor
                 modification to the design, it is possible to estimate
                 performance and power efficiency trade off to identify
                 optimized workload distribution.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Georgescu:2011:GAC,
  author =       {Serban Georgescu and Peter Chow},
  title =        {{GPU} accelerated {CAE} using open solvers and the
                 cloud},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {39},
  number =       {4},
  pages =        {14--19},
  month =        sep,
  year =         {2011},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/2082156.2082161},
  ISSN =         {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L =       {0163-5964},
  bibdate =      {Tue Dec 20 17:53:58 MST 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract =     {After more than five years since GPUs were first used
                 as accelerators for general scientific computations,
                 the field of General Purpose GPU computing or GPGPU has
                 finally reached mainstream. Developers have now access
                 to a mature hardware and software ecosystem. On the
                 software side, several major open-source packages now
                 support GPU acceleration while on the hardware side
                 cloud-based solutions provide a simple way to access
                 powerful machines with the latest GPUs at low cost. In
                 this context, we look at the GPU acceleration of CAE,
                 with a focus on the matrix solvers. We compare the
                 performance that can be achieved using the open-source
                 solver package PETSc ran on GPU-enabled Amazon EC2
                 hardware with that of an optimized legacy FEM code ran
                 on a last generation 12-core blade server.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Chen:2011:DSE,
  author =       {Junying Chen and Billy Y. S. Yiu and Brandon K.
                 Hamilton and Alfred C. H. Yu and Hayden K.-H. So},
  title =        {Design space exploration of adaptive beamforming
                 acceleration for bedside and portable medical
                 ultrasound imaging},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {39},
  number =       {4},
  pages =        {20--25},
  month =        sep,
  year =         {2011},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/2082156.2082162},
  ISSN =         {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L =       {0163-5964},
  bibdate =      {Tue Dec 20 17:53:58 MST 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract =     {The use of adaptive beamforming is a viable solution
                 to provide high-resolution real-time medical ultrasound
                 imaging. However, the increase in image resolution
                 comes at an expense of a significant increase in
                 compute requirement over conventional algorithms. In a
                 bedside diagnosis setting where plug-in power is
                 available, GPUs are promising accelerators to address
                 the processing demand. However, in the case of
                 point-of-care diagnostics where portable ultrasound
                 imaging devices must be used, alternative
                 power-efficient computer systems must be employed,
                 possibly at the expense of lower image resolution in
                 order to maintain real-time performance. This paper
                 presents an initial design space exploration on viable
                 compute architectures that might address the
                 drastically different requirements between bedside and
                 portable medical ultrasound imaging systems using
                 adaptive beamforming.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Dohi:2011:GIO,
  author =       {Keisuke Dohi and Yuichiro Shibata and Kiyoshi Oguri
                 and Takafumi Fujimoto},
  title =        {{GPU} implementation and optimization of
                 electromagnetic simulation using the {FDTD} method for
                 antenna designing},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {39},
  number =       {4},
  pages =        {26--31},
  month =        sep,
  year =         {2011},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/2082156.2082163},
  ISSN =         {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L =       {0163-5964},
  bibdate =      {Tue Dec 20 17:53:58 MST 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract =     {This paper describes electromagnetical field
                 simulation using the 3D-FDTD method for antenna
                 designing on a CUDA compatible GPU. We use the Split
                 Perfectly Matched Layer as an absorbing boundary
                 condition. As is well known, the 3D-FDTD method is a
                 kind of stencil computation and is considered better at
                 GPU implementation. In order to find the best blocking
                 size for the target GPU architecture, we empirically
                 explore a design space of blocking size. We also
                 propose a kernel fusing method as one of the efficient
                 optimization methods, which improves the total
                 performance about 10\% at the cost of a small increase
                 in memory usage.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Nagatsuka:2011:CER,
  author =       {Tomoyuki Nagatsuka and Yoshito Sakaguchi and Takayuki
                 Matsumura and Kenji Kise},
  title =        {{CoreSymphony}: an efficient reconfigurable multi-core
                 architecture},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {39},
  number =       {4},
  pages =        {32--37},
  month =        sep,
  year =         {2011},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/2082156.2082165},
  ISSN =         {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L =       {0163-5964},
  bibdate =      {Tue Dec 20 17:53:58 MST 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract =     {This paper describes CoreSymphony, a cooperative and
                 reconfigurable superscalar processor architecture that
                 improves single-thread performance in chip
                 multiprocessor. CoreSymphony enables some narrow-issue
                 cores to be fused into a single wide-issue core. In
                 this paper, we describe the problems associated with
                 achieving the cooperative superscalar processor. We
                 then describe techniques by which to overcome these
                 problems. The evaluation results obtained using
                 SPEC2006 benchmarks indicate that four-core fusion
                 achieves 88\% higher IPC than an individual core.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Takamaeda-Yamazaki:2011:FBS,
  author =       {Shinya Takamaeda-Yamazaki and Ryosuke Sasakawa and
                 Yoshito Sakaguchi and Kenji Kise},
  title =        {An {FPGA}-based scalable simulation accelerator for
                 tile architectures},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {39},
  number =       {4},
  pages =        {38--43},
  month =        sep,
  year =         {2011},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/2082156.2082166},
  ISSN =         {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L =       {0163-5964},
  bibdate =      {Tue Dec 20 17:53:58 MST 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract =     {FPGA-based simulation systems can simulate processor
                 behavior in realistic time. In order to practically
                 simulate tile many-core architectures, we propose
                 ScalableCore for prototyping system development using
                 multiple FPGAs. In this paper, we present an FPGA-based
                 platform called ScalableCore system 1.1, which consists
                 of several simulation tiles named ScalableCore Units.
                 Each tile is connected to four neighbor tiles via
                 interface boards called ScalableCore Boards, and so
                 increasing the target number of cores is easy. We also
                 describe useful techniques by which to achieve high
                 scalability of simulation and to implement complicated
                 hardware functions on an FPGA. The developed system
                 simulates the behavior of a tile architecture with DMA
                 communications and NoC 14.2 times faster than a
                 corresponding software-based functional simulator
                 running on a standard computer with an Intel Core2Duo
                 processor.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Sano:2011:DSP,
  author =       "Kentaro Sano and Satoru Yamamoto and Yoshiaki
                 Hatsuda",
  title =        "Domain-specific programmable design of scalable
                 streaming-array for power-efficient stencil
                 computation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "4",
  pages =        "44--49",
  month =        sep,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2082156.2082168",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Dec 20 17:53:58 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper presents the domain-specific programmable
                 design of custom computing machines for
                 high-performance stencil computation. Stencil
                 computation is one of the typical kernels in scientific
                 computations, however its low operational-intensity
                 makes the sustained performance limited by memory
                 bandwidth on recent microprocessors and GPUs. So far we
                 have proposed a scalable streaming-array (SSA) of
                 processing elements, which provides almost linear
                 scalability by increasing FPGAs with a constant
                 external memory bandwidth. In order to facilitate
                 custom computing and efficiently utilize hardware
                 resources for various and complex stencil-computations,
                 we design programmable SSA with limited but necessary
                 functionality. We show the design concept, the
                 programmable structure and the SIMD instruction set for
                 SSA.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Akamine:2011:IOE,
  author =       {Takayuki Akamine and Kenta Inakagata and Yasunori
                 Osana and Naoyuki Fujita and Hideharu Amano},
  title =        {An implementation of out-of-order execution system for
                 acceleration of computational fluid dynamics on
                 {FPGAs}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {39},
  number =       {4},
  pages =        {50--55},
  month =        sep,
  year =         {2011},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/2082156.2082169},
  ISSN =         {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L =       {0163-5964},
  bibdate =      {Tue Dec 20 17:53:58 MST 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract =     {CFD is an important tool for designing aircraft
                 components. FaSTAR is one of the most recent CFD
                 program package with various solvers and automatic
                 generation of grid data. However, FaSTAR is difficult
                 to be executed in parallel machines because of its
                 irregular data structure. Here, the surface integral
                 module, one of cores of FaSTAR is implemented in an
                 FPGA for future acceleration using a platform FLOPS-2D.
                 However, even with hardware execution, the pipeline
                 module suffers from frequent stalls caused by irregular
                 and successive memory access. In order to rid of the
                 problem, a data controller for Out-Of-Order execution
                 was designed and implemented.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Liu:2011:EAH,
  author =       {Haisheng Liu and Smail Niar and Yassin El-Hillali and
                 Atika Rivenq},
  title =        {Embedded architecture with hardware accelerator for
                 target recognition in driver assistance system},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {39},
  number =       {4},
  pages =        {56--59},
  month =        sep,
  year =         {2011},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/2082156.2082170},
  ISSN =         {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L =       {0163-5964},
  bibdate =      {Tue Dec 20 17:53:58 MST 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract =     {This paper presents a new Radar-based recognition
                 system, which is able to identify obstacles during a
                 vehicle movement. Obstacles recognition gives the
                 benefits of avoiding false alarms and allows generating
                 alarms that take into account the identification of the
                 obstacle in front of the vehicle. In this paper, we
                 first identify hotspots in the target recognition
                 application. Then, we propose an optimized version of
                 the multiple target recognition algorithm to respect
                 the real time constraints of the application while
                 simplifying the underlying hardware platform. We also
                 propose a flexible embedded architecture with hardware
                 accelerator that supports the proposed algorithm.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
}

@Article{Pell:2011:SEF,
  author          = {Oliver Pell and Oskar Mencer},
  title           = {Surviving the end of frequency scaling with
                     reconfigurable dataflow computing},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {4},
  pages           = {60--65},
  month           = sep,
  year            = {2011},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/2082156.2082172},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Tue Dec 20 17:53:58 MST 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Over the past decade x86 processors have come to
                     dominate the world's largest supercomputers. However in
                     the future conventional multicore processors are
                     unlikely to be able to deliver the necessary
                     performance per \$ and per W to achieve exascale
                     performance. Heterogeneous computing is emerging as a
                     powerful alternative to conventional multi-core to help
                     address these challenges. In this paper we describe our
                     approach to Maximum Performance Computing --- building
                     application-specific computers which complement
                     conventional x86 processors with high performance
                     dataflow engines implemented on FPGA to provide
                     10--100$ \times $ improvements in performance and
                     performance/W.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Balevic:2011:KAD,
  author          = {Ana Balevic and Bart Kienhuis},
  title           = {{KPN2GPU}: an approach for discovery and exploitation
                     of fine-grain data parallelism in process networks},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {4},
  pages           = {66--71},
  month           = sep,
  year            = {2011},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/2082156.2082173},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Tue Dec 20 17:53:58 MST 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {With advances in manycore and accelerator
                     architectures, the high performance and embedded spaces
                     are rapidly converging. Emerging architectures feature
                     different forms of parallelism. The Polyhedral
                     Processes Networks (PPNs) are a proven model of choice
                     for automated generation of pipeline and task parallel
                     programs from sequential source code, however data
                     parallelism is not addressed. In this paper, we present
                     a systematic approach for identification and extraction
                     of fine grain data parallelism from the PPN
                     specification. The approach is implemented in a tool,
                     called kpn2gpu, which produces fine-grain data parallel
                     CUDA kernels for graphics processing units (GPUs).},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Akagic:2011:HSC,
  author          = {Amila Akagi{\'c} and Hideharu Amano},
  title           = {High speed {CRC} with 64-bit generator polynomial on
                     an {FPGA}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {4},
  pages           = {72--77},
  month           = sep,
  year            = {2011},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/2082156.2082175},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Tue Dec 20 17:53:58 MST 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Deployment of jumbo frame sizes beyond 9000 bytes for
                     storage systems is limited by 32-bit Cyclic Redundancy
                     Checks used by a network protocol. In order to overcome
                     this limitation we study possibility of using 64-bit
                     polynomials in software and hardware, by using fastest
                     multiple lookup tables algorithms for generating CRCs.
                     CRC is a sequential process, thus the software based
                     solutions are limited in throughput by speed and
                     architectural improvements of a single CPU. We study
                     tradeoff between using distributed LUTs and embedded
                     BRAM in hardware implementations. Our results show that
                     BRAM-based approach is the fastest hardware
                     implementation, reaching maximum of 347.37 Gbps while
                     processing 1024 bits at a time, which is 606x faster
                     than the software implementation of the same algorithm
                     running on Xeon 3.2 GHz with 2 MB of L2 cache.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Yang:2011:BPR,
  author          = {Shufan Yang and T. M. McGinnity},
  title           = {A biologically plausible real-time spiking neuron
                     simulation environment based on a multiple-{FPGA}
                     platform},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {4},
  pages           = {78--81},
  month           = sep,
  year            = {2011},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/2082156.2082176},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Tue Dec 20 17:53:58 MST 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Neurological research has revealed that neurons encode
                     information in the timing of spikes. Spiking neural
                     network simulations are a flexible and powerful method
                     for investigating the behaviour of such neuronal
                     systems. The spiking neuron models which are used in
                     simulations can be described mathematically, but the
                     continuous time involved in mathematical models needs
                     to be replaced by discrete time steps. An alternative
                     approach, hardware implementation, provides the
                     possibility of generating independent spikes precisely
                     and simultaneously output spike waves in real
                     biological time, under the premise that the spiking
                     neural network implemented in hardware can take full
                     advantage of hardware-timed speed and reliability.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Sawada:2011:PCW,
  author          = {Hiroomi Sawada and Morihiro Kuga and Motoki Amagasaki
                     and Masahiro Iida and Toshinori Sueyoshi},
  title           = {Parallelization of the channel width search for {FPGA}
                     routing},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {4},
  pages           = {82--85},
  month           = sep,
  year            = {2011},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/2082156.2082177},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Tue Dec 20 17:53:58 MST 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Tanabe:2011:SFB,
  author          = {Shoji Tanabe and Takuya Nagashima and Yoshiki
                     Yamaguchi},
  title           = {A study of an {FPGA} based flexible {SIMD} processor},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {4},
  pages           = {86--89},
  month           = sep,
  year            = {2011},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/2082156.2082179},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Tue Dec 20 17:53:58 MST 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Trouve:2011:ADA,
  author          = {Antoine Trouve and Kazuaki Murakami},
  title           = {Augmenting {DR-ASIP} flexibility through multi-mode
                     custom instructions},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {4},
  pages           = {90--93},
  month           = sep,
  year            = {2011},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/2082156.2082180},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Tue Dec 20 17:53:58 MST 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {This paper introduces a simple method called multimode
                     custom instructions, which aims at reducing the power
                     consumption of the register file of tightly coupled
                     dynamically reconfigurable application specific
                     instruction set processors (DR-ASIPs). To this end, it
                     proposes to divide custom instructions into two sets
                     depending on criteria related to their size,
                     distribution and reuse rate. Performance is measured on
                     a RISC DR-ASIP with a subset of MiBench using an
                     original automatic custom instruction generator from
                     assembly based on the dancing link algorithm.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Kubota:2011:MWS,
  author          = {Shinya Kubota and Minoru Watanabe},
  title           = {A {MEMS} writer system embedded for a programmable
                     optically reconfigurable gate array},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {4},
  pages           = {94--97},
  month           = sep,
  year            = {2011},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/2082156.2082181},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Tue Dec 20 17:53:58 MST 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Fousek:2011:AFC,
  author          = {Jan Fousek and Ji{\v{r}}i Filipovi{\v{c}} and
                     Matu{\v{s}} Madzin},
  title           = {Automatic fusions of {CUDA--GPU} kernels for parallel
                     map},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {4},
  pages           = {98--99},
  month           = sep,
  year            = {2011},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/2082156.2082183},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Tue Dec 20 17:53:58 MST 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {When implementing a function mapping on the
                     contemporary GPU, several contradictory performance
                     factors affecting distribution of computation into GPU
                     kernels have to be balanced. A decomposition-fusion
                     scheme suggests to decompose the computational problem
                     to be solved by several simple functions implemented as
                     standalone kernels and to fuse some of these functions
                     later into more complex kernels to improve memory
                     locality. In this paper, a prototype of
                     source-to-source compiler automating the fusion phase
                     is presented and the impact of fusions generated by the
                     compiler as well as compiler efficiency is
                     experimentally evaluated.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Matsunobu:2011:DCE,
  author          = {Kohei Matsunobu and Keisuke Dohi and Yuichiro Shibata
                     and Kiyoshi Oguri},
  title           = {A discussion on calculating eigenvalues of real
                     symmetric tridiagonal matrices on a {GPU}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {4},
  pages           = {100--101},
  month           = sep,
  year            = {2011},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/2082156.2082184},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Tue Dec 20 17:53:58 MST 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {While GPUs are attracting attention as an accelerator
                     in wide-ranged application areas, compatibility between
                     the architecture and selected algorithm is important to
                     effectively bring out their potential performance. This
                     paper focuses on eigenvalue calculation from a given
                     real symmetric tridiagonal matrix and compares GPU
                     implementations for the QR method and the bisection
                     method. Implementation for a total of four different
                     GPU architectures are shown and compared to reveal the
                     affinity between algorithms and architectures.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Meyer:2011:MRP,
  author =       "Dominik Meyer and Bernd Klauer",
  title =        "Multicore reconfiguration platform an alternative to
                 {RAMPSoC}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "4",
  pages =        "102--103",
  month =        sep,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2082156.2082185",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Dec 20 17:53:58 MST 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The current state of the art in processor performance
                 improvement is multicore-processor systems. These
                 systems offer a number of homogeneous and static
                 processor cores for the parallel distribution of
                 computational tasks. A novel idea in this research
                 field is introduced by the Runtime Adaptive
                 Multi-Processor System-on-Chip (RAMPSoC) approach. It
                 uses a dynamic and partial reconfigurable system to
                 offer a heterogeneous multicore-processor system. It is
                 runtime adaptable to applications needs and provides a
                 high degree of freedom for system design and task
                 distribution. The continuation of this idea is the
                 Multicore Reconfiguration Platform (MRP) presented in
                 this paper.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bonamy:2011:PLI,
  author          = {Robin Bonamy and Daniel Chillet and Olivier Sentieys
                     and Sebastien Bilavarn},
  title           = {Parallelism Level Impact on Energy Consumption in
                     Reconfigurable Devices},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {4},
  pages           = {104--105},
  month           = sep,
  year            = {2011},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/2082156.2082186},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Tue Dec 20 17:53:58 MST 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Nowadays, System-on-Chip architectures are composed of
                     several execution resources which support complex
                     applications. As it shares silicon area and limits the
                     cost of the global circuit, the embedding of a
                     reconfigurable resource in these SoC provides
                     flexibility to the hardware. In this case, several
                     implementations of the same algorithm, offering
                     different characteristics, can be considered in order
                     to optimize performances. In general, the tasks mapped
                     on reconfigurable resources are algorithms that can be
                     defined through several levels of parallelism.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Agyeman:2011:PAO,
  author          = {Michael Opoku Agyeman and Ali Ahmadinia},
  title           = {Power and area optimisation in heterogeneous {$3$D}
                     networks-on-chip architectures},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {4},
  pages           = {106--107},
  month           = sep,
  year            = {2011},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/2082156.2082187},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Tue Dec 20 17:53:58 MST 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Three dimensional Network-on-Chip (3D NoC)
                     architectures have evolved with a lot of interest to
                     address the on-chip communication delays of modern SoC
                     systems. However, the vertical interconnections between
                     layers is more power and area hungry compared to 2D
                     interconnections. In this paper we propose area
                     efficient and low power heterogeneous NoC
                     architectures, which combines both the power and
                     performance benefits of 2D routers and 3D NoC-bus
                     hybrid router architectures in 3D mesh topologies.
                     Experimental results show a negligible penalty of up to
                     5\% in average packet latency of 3D homogeneous NoC
                     with bus hybrid routers.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:2011:INb,
  author          = {Mark Thorson},
  title           = {{Internet} nuggets},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {39},
  number          = {4},
  pages           = {108--117},
  month           = sep,
  year            = {2011},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/2082156.2082189},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Tue Dec 20 17:53:58 MST 2011},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Das:2011:HSR,
  author =       "Malay Das and Amitabha Sinha and Nishant Kumar Giri",
  title =        "High speed residue number system ({RNS}) based {FIR}
                 filter using distributed arithmetic ({DA})",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "5",
  pages =        "1--4",
  month =        dec,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2093339.2093341",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Mar 15 14:07:10 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In this paper high speed Residue Number System (RNS)
                 based FIR filter using Distributed Arithmetic (DA) is
                 proposed. The proposed architecture uses the module set
                 having the value of numbers as small as possible. In
                 case of using Distributed Arithmetic in FIR filter; the
                 size of LUTs gets increased exponentially with the
                 increase of tap of the filter. Here care has been taken
                 so that sizes of LUTs do not get increased. The
                 proposed architecture is designed using Verilog HDL; a
                 popular hardware description language [9]. The design
                 is synthesized with ISE 10.1 and implemented on
                 Xilinx's Virtex-4. The proposed architecture is also
                 compared with conventional RNS-DA FIR filter. The
                 results show that the proposed architecture can
                 implement FIR filter with high speed.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chakraborty:2011:CBS,
  author =       "Anindita Chakraborty and Amitabha Sinha",
  title =        "Conversion of binary to single-term triple base
                 numbers for {DSP} applications",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "5",
  pages =        "5--11",
  month =        dec,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2093339.2093342",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Mar 15 14:07:10 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Non-binary number systems are increasingly gaining
                 popularity in signal processing applications for their
                 capabilities of handling arithmetic operations
                 efficiently. One such number system, ``Double Base
                 Number System (DBNS)'' has gained attention to many
                 researchers for it's capability of performing
                 multiplication operation efficiently. Recently,
                 ``Triple Base Number System (TBNS)'' has been
                 introduced which shows better performance over DBNS for
                 higher bit operations in terms of speed, hardware
                 complexity and power dissipation. However, the
                 advantages of TBNS systems cannot be exploited due to
                 substantial overhead of conversion from binary to TBNS.
                 Keeping this issue in view, in this paper, a novel
                 architecture has been proposed for high performance
                 binary to TBNS conversion. Efficiency of this
                 conversion scheme has been dealt with in details and
                 experimental results and analysis clearly indicate the
                 novelty of the architecture.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Singha:2011:NAF,
  author =       "Satrughna Singha and Aniruddha Ghosh and Amitabha
                 Sinha",
  title =        "A new architecture for {FPGA} based implementation of
                 conversion of binary to double base number system
                 ({DBNS}) using parallel search technique",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "5",
  pages =        "12--18",
  month =        dec,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2093339.2093343",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Mar 15 14:07:10 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Compute intensive signal Processing Algorithms demand
                 efficient execution of high performance arithmetic
                 operations. Since, double base number system (DBNS)
                 offers high performance arithmetic units, it is gaining
                 attention to many researchers. However, the advantage
                 of DBNS can not be exploited due to large conversion
                 time from binary to DBNS. Keeping this issue in view,
                 this paper presents a novel conversion scheme using
                 parallel search technique.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:2011:INc,
  author =       "Mark Thorson",
  title =        "{Internet} nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "39",
  number =       "5",
  pages =        "19--23",
  month =        dec,
  year =         "2011",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2093339.2093345",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Mar 15 14:07:10 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lymberopoulos:2012:PIW,
  author =       "Dimitrios Lymberopoulos and Oriana Riva and Karin
                 Strauss and Akshay Mittal and Alexandros Ntoulas",
  title =        "{PocketWeb}: instant web browsing for mobile devices",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "1",
  pages =        "1--12",
  month =        mar,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2189750.2150978",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 1 17:06:46 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ASPLOS '12 conference proceedings.",
  abstract =     "The high network latencies and limited battery life of
                 mobile phones can make mobile web browsing a
                 frustrating experience. In prior work, we proposed
                 trading memory capacity for lower web access latency
                 and a more convenient data transfer schedule from an
                 energy perspective by prefetching slowly-changing data
                 (search queries and results) nightly, when the phone is
                 charging. However, most web content is intrinsically
                 much more dynamic and may be updated multiple times a
                 day, thus eliminating the effectiveness of periodic
                 updates. This paper addresses the challenge of
                 prefetching dynamic web content in a timely fashion,
                 giving the user an instant web browsing experience but
                 without aggravating the battery lifetime issue. We
                 start by analyzing the web access traces of 8,000
                 users, and observe that mobile web browsing exhibits a
                 strong spatiotemporal signature, which is different for
                 every user. We propose to use a machine learning
                 approach based on stochastic gradient boosting
                 techniques to efficiently model this signature on a per
                 user basis. The machine learning model is capable of
                 accurately predicting future web accesses and
                 prefetching the content in a timely manner. Our
                 experimental evaluation with 48,000 models trained on
                 real user datasets shows that we can accurately
                 prefetch 60\% of the URLs for about 80--90\% of the
                 users within 2 minutes before the request. The system
                 prototype we built not only provides more than 80\%
                 lower web access time for more than 80\% of the users,
                 but it also achieves the same or lower radio energy
                 dissipation by more than 50\% for the majority of
                 mobile users.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lin:2012:RUL,
  author =       "Felix Xiaozhu Lin and Zhen Wang and Robert LiKamWa and
                 Lin Zhong",
  title =        "{Reflex}: using low-power processors in smartphones
                 without knowing them",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "1",
  pages =        "13--24",
  month =        mar,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2189750.2150979",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 1 17:06:46 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ASPLOS '12 conference proceedings.",
  abstract =     "To accomplish frequent, simple tasks with high
                 efficiency, it is necessary to leverage low-power,
                 microcontroller-like processors that are increasingly
                 available on mobile systems. However, existing
                 solutions require developers to directly program the
                 low-power processors and carefully manage
                 inter-processor communication. We present Reflex, a
                 suite of compiler and runtime techniques that
                 significantly lower the barrier for developers to
                 leverage such low-power processors. The heart of Reflex
                 is a software Distributed Shared Memory (DSM) that
                 enables shared memory objects with release consistency
                 among code running on loosely coupled processors. In
                 order to achieve high energy efficiency without
                 sacrificing performance much, the Reflex DSM leverages
                 (i) extreme architectural asymmetry between low-power
                 processors and powerful central processors, (ii)
                 aggressive compile-time optimization, and (iii) a
                 minimalist runtime that supports efficient message
                 passing and event-driven execution. We report a
                 complete realization of Reflex that runs on a TI
                 OMAP4430-based development platform as well as on a
                 custom tri-processor mobile platform. Using smartphone
                 sensing applications reported in recent literature, we
                 show that Reflex supports a programming style very
                 close to contemporary smartphone programming. Compared
                 to message passing, the Reflex DSM greatly reduces
                 efforts in programming heterogeneous smartphones,
                 eliminating up to 38\% of the source lines of
                 application code. Compared to running the same
                 applications on existing smartphones, Reflex reduces
                 the average system power consumption by up to 81\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Chang:2012:TGE --- the abstract wording follows the publisher's record:
%%% the lifecycle metric is thermodynamic exergy (available energy), a term
%%% that is easily mistaken for a misspelling of ``energy''.
@Article{Chang:2012:TGE,
  author =       "Jichuan Chang and Justin Meza and Parthasarathy
                 Ranganathan and Amip Shah and Rocky Shih and Cullen
                 Bash",
  title =        "Totally green: evaluating and designing servers for
                 lifecycle environmental impact",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "1",
  pages =        "25--36",
  month =        mar,
  year =         "2012",
  DOI =          "https://doi.org/10.1145/2189750.2150980",
  bibdate =      "Fri Jun 1 17:06:46 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ASPLOS '12 conference proceedings.",
  abstract =     "The environmental impact of servers and datacenters is
                 an important future challenge. System architects have
                 traditionally focused on operational energy as a proxy
                 for designing green servers, but this ignores important
                 environmental implications from server production
                 (materials, manufacturing, etc.). In contrast, this
                 paper argues for a lifecycle focus on the environmental
                 impact of future server designs, to include both
                 operation and production. We present a new methodology
                 to quantify the total environmental impact of system
                 design decisions. Our approach uses the thermodynamic
                 metric of exergy consumption, adapted and validated for
                 use by system architects. Using this methodology, we
                 evaluate the lifecycle impact of several example system
                 designs with environment-friendly optimizations. Our
                 results show that environmental impact from production
                 can be important (around 20\% on current servers and
                 growing) and system design choices can reduce this
                 component (by 30--40\%). Our results also highlight
                 several, sometimes unexpected, cross-interactions
                 between the environmental impact of production and
                 operation that further motivate a total lifecycle
                 emphasis for future green server designs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ferdman:2012:CCS,
  author          = {Michael Ferdman and Almutaz Adileh and Onur Kocberber
                     and Stavros Volos and Mohammad Alisafaee and Djordje
                     Jevdjic and Cansu Kaynak and Adrian Daniel Popescu and
                     Anastasia Ailamaki and Babak Falsafi},
  title           = {Clearing the clouds: a study of emerging scale-out
                     workloads on modern hardware},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {40},
  number          = {1},
  pages           = {37--48},
  month           = mar,
  year            = {2012},
  DOI             = {https://doi.org/10.1145/2189750.2150982},
  bibdate         = {Fri Jun 1 17:06:46 MDT 2012},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note            = {ASPLOS '12 conference proceedings.},
  abstract        = {Emerging scale-out workloads require extensive amounts
                     of computational resources. However, data centers using
                     modern server hardware face physical constraints in
                     space and power, limiting further expansion and calling
                     for improvements in the computational density per
                     server and in the per-operation energy. Continuing to
                     improve the computational resources of the cloud while
                     staying within physical constraints mandates optimizing
                     server efficiency to ensure that server hardware
                     closely matches the needs of scale-out workloads. In
                     this work, we introduce CloudSuite, a benchmark suite
                     of emerging scale-out workloads. We use performance
                     counters on modern servers to study scale-out
                     workloads, finding that today's predominant processor
                     micro-architecture is inefficient for running these
                     workloads. We find that inefficiency comes from the
                     mismatch between the workload needs and modern
                     processors, particularly in the organization of
                     instruction and data memory systems and the processor
                     core micro-architecture. Moreover, while today's
                     predominant micro-architecture is inefficient when
                     executing scale-out workloads, we find that continuing
                     the current trends will further exacerbate the
                     inefficiency in the future. In this work, we identify
                     the key micro-architectural needs of scale-out
                     workloads, calling for a change in the trajectory of
                     server processors that would lead to improved
                     computational density and power efficiency in data
                     centers.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Chen:2012:IOD,
  author          = {Yang Chen and Shuangde Fang and Lieven Eeckhout and
                     Olivier Temam and Chengyong Wu},
  title           = {Iterative optimization for the data center},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {40},
  number          = {1},
  pages           = {49--60},
  month           = mar,
  year            = {2012},
  DOI             = {https://doi.org/10.1145/2189750.2150983},
  bibdate         = {Fri Jun 1 17:06:46 MDT 2012},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note            = {ASPLOS '12 conference proceedings.},
  abstract        = {Iterative optimization is a simple but powerful
                     approach that searches for the best possible
                     combination of compiler optimizations for a given
                     workload. However, each program, if not each data set,
                     potentially favors a different combination. As a
                     result, iterative optimization is plagued by several
                     practical issues that prevent it from being widely used
                     in practice: a large number of runs are required for
                     finding the best combination; the process can be data
                     set dependent; and the exploration process incurs
                     significant overhead that needs to be compensated for
                     by performance benefits. Therefore, while iterative
                     optimization has been shown to have significant
                     performance potential, it is seldomly used in
                     production compilers. In this paper, we propose
                     Iterative Optimization for the Data Center (IODC): we
                     show that servers and data centers offer a context in
                     which all of the above hurdles can be overcome. The
                     basic idea is to spawn different combinations across
                     workers and recollect performance statistics at the
                     master, which then evolves to the optimum combination
                     of compiler optimizations. IODC carefully manages costs
                     and benefits, and is transparent to the end user. We
                     evaluate IODC using both MapReduce and throughput
                     compute-intensive server applications. In order to
                     reflect the large number of users interacting with the
                     system, we gather a very large collection of data sets
                     (at least 1000 and up to several million unique data
                     sets per program), for a total storage of 10.7TB, and
                     568 days of CPU time. We report an average performance
                     improvement of 1.48$ \times $, and up to 2.08$ \times
                     $, for the MapReduce applications, and 1.14$ \times $,
                     and up to 1.39$ \times $, for the throughput
                     compute-intensive server applications.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Ahmad:2012:TOM,
  author          = {Faraz Ahmad and Srimat T. Chakradhar and Anand
                     Raghunathan and T. N. Vijaykumar},
  title           = {{Tarazu}: optimizing {MapReduce} on heterogeneous
                     clusters},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {40},
  number          = {1},
  pages           = {61--74},
  month           = mar,
  year            = {2012},
  DOI             = {https://doi.org/10.1145/2189750.2150984},
  bibdate         = {Fri Jun 1 17:06:46 MDT 2012},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note            = {ASPLOS '12 conference proceedings.},
  abstract        = {Data center-scale clusters are evolving towards
                     heterogeneous hardware for power, cost, differentiated
                     price-performance, and other reasons. MapReduce is a
                     well-known programming model to process large amount of
                     data on data center-scale clusters. Most MapReduce
                     implementations have been designed and optimized for
                     homogeneous clusters. Unfortunately, these
                     implementations perform poorly on heterogeneous
                     clusters (e.g., on a 90-node cluster that contains 10
                     Xeon-based servers and 80 Atom-based servers, Hadoop
                     performs worse than on 10-node Xeon-only or 80-node
                     Atom-only homogeneous sub-clusters for many of our
                     benchmarks). This poor performance remains despite
                     previously proposed optimizations related to management
                     of straggler tasks. In this paper, we address
                     MapReduce's poor performance on heterogeneous clusters.
                     Our first contribution is that the poor performance is
                     due to two key factors: (1) the non-intuitive effect
                     that MapReduce's built-in load balancing results in
                     excessive and bursty network communication during the
                     Map phase, and (2) the intuitive effect that the
                     heterogeneity amplifies load imbalance in the Reduce
                     computation. Our second contribution is Tarazu, a suite
                     of optimizations to improve MapReduce performance on
                     heterogeneous clusters. Tarazu consists of (1)
                     Communication-Aware Load Balancing of Map computation
                     (CALB) across the nodes, (2) Communication-Aware
                     Scheduling of Map computation (CAS) to avoid bursty
                     network traffic and (3) Predictive Load Balancing of
                     Reduce computation (PLB) across the nodes. Using the
                     above 90-node cluster, we show that Tarazu
                     significantly improves performance over a baseline of
                     Hadoop with straightforward tuning for hardware
                     heterogeneity.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Govindan:2012:LSE,
  author          = {Sriram Govindan and Di Wang and Anand Sivasubramaniam
                     and Bhuvan Urgaonkar},
  title           = {Leveraging stored energy for handling power
                     emergencies in aggressively provisioned datacenters},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {40},
  number          = {1},
  pages           = {75--86},
  month           = mar,
  year            = {2012},
  DOI             = {https://doi.org/10.1145/2189750.2150985},
  bibdate         = {Fri Jun 1 17:06:46 MDT 2012},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note            = {ASPLOS '12 conference proceedings.},
  abstract        = {Datacenters spend \$10--25 per watt in provisioning
                     their power infrastructure, regardless of the watts
                     actually consumed. Since peak power needs arise rarely,
                     provisioning power infrastructure for them can be
                     expensive. One can, thus, aggressively under-provision
                     infrastructure assuming that simultaneous peak draw
                     across all equipment will happen rarely. The resulting
                     non-zero probability of emergency events where power
                     needs exceed provisioned capacity, however small,
                     mandates graceful reaction mechanisms to cap the power
                     draw instead of leaving it to disruptive circuit
                     breakers/fuses. Existing strategies for power capping
                     use temporal knobs local to a server that throttle the
                     rate of execution (using power modes), and/or spatial
                     knobs that redirect/migrate excess load to regions of
                     the datacenter with more power headroom. We show these
                     mechanisms to have performance degrading ramifications,
                     and propose an entirely orthogonal solution that
                     leverages existing UPS batteries to temporarily augment
                     the utility supply during emergencies. We build an
                     experimental prototype to demonstrate such power
                     capping on a cluster of 8 servers, each with an
                     individual battery, and implement several online
                     heuristics in the context of different datacenter
                     workloads to evaluate their effectiveness in handling
                     power emergencies. We show that: (i) our battery-based
                     solution can handle emergencies of short duration on
                     its own, (ii) supplement existing reaction mechanisms
                     to enhance their efficacy for longer emergencies, and
                     (iii) battery even provide feasible options when other
                     knobs do not suffice.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Kadav:2012:UMD,
  author          = {Asim Kadav and Michael M. Swift},
  title           = {Understanding modern device drivers},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {40},
  number          = {1},
  pages           = {87--98},
  month           = mar,
  year            = {2012},
  DOI             = {https://doi.org/10.1145/2189750.2150987},
  bibdate         = {Fri Jun 1 17:06:46 MDT 2012},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note            = {ASPLOS '12 conference proceedings.},
  abstract        = {Device drivers are the single largest contributor to
                     operating-system kernel code with over 5 million lines
                     of code in the Linux kernel, and cause significant
                     complexity, bugs and development costs. Recent years
                     have seen a flurry of research aimed at improving the
                     reliability and simplifying the development of drivers.
                     However, little is known about what constitutes this
                     huge body of code beyond the small set of drivers used
                     for research. In this paper, we study the source code
                     of Linux drivers to understand what drivers actually
                     do, how current research applies to them and what
                     opportunities exist for future research. We determine
                     whether assumptions made by most driver research, such
                     as that all drivers belong to a class, are indeed true.
                     We also analyze driver code and abstractions to
                     determine whether drivers can benefit from code
                     re-organization or hardware trends. We develop a set of
                     static-analysis tools to analyze driver code across
                     various axes. Broadly, our study looks at three aspects
                     of driver code (i) what are the characteristics of
                     driver code functionality and how applicable is driver
                     research to all drivers, (ii) how do drivers interact
                     with the kernel, devices, and buses, and (iii) are
                     there similarities that can be abstracted into
                     libraries to reduce driver size and complexity? We find
                     that many assumptions made by driver research do not
                     apply to all drivers. At least 44\% of drivers have
                     code that is not captured by a class definition, 28\%
                     of drivers support more than one device per driver, and
                     15\% of drivers do significant computation over data.
                     From the driver interactions study, we find USB bus
                     offers an efficient bus interface with significant
                     standardized code and coarse-grained access, ideal for
                     executing drivers in isolation. We also find that
                     drivers for different buses and classes have widely
                     varying levels of device interaction, which indicates
                     that the cost of isolation will vary by class. Finally,
                     from our driver similarity study, we find 8\% of all
                     driver code is substantially similar to code elsewhere
                     and may be removed with new abstractions or
                     libraries.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Panneerselvam:2012:COS,
  author          = {Sankaralingam Panneerselvam and Michael M. Swift},
  title           = {{Chameleon}: operating system support for dynamic
                     processors},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {40},
  number          = {1},
  pages           = {99--110},
  month           = mar,
  year            = {2012},
  DOI             = {https://doi.org/10.1145/2189750.2150988},
  bibdate         = {Fri Jun 1 17:06:46 MDT 2012},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note            = {ASPLOS '12 conference proceedings.},
  abstract        = {The rise of multi-core processors has shifted
                     performance efforts towards parallel programs. However,
                     single-threaded code, whether from legacy programs or
                     ones difficult to parallelize, remains important.
                     Proposed asymmetric multicore processors statically
                     dedicate hardware to improve sequential performance,
                     but at the cost of reduced parallel performance.
                     However, several proposed mechanisms provide the
                     best-of-both-worlds by combining multiple cores into a
                     single, more powerful processor for sequential code.
                     For example, Core Fusion merges multiple cores to pool
                     caches and functional units, and Intel's Turbo Boost
                     raises the clock speed of a core if the other cores on
                     a chip are powered down. These reconfiguration
                     mechanisms have two important properties. First the set
                     of available cores and their capabilities can vary over
                     short time scales. Current operating systems are not
                     designed for rapidly changing hardware: the existing
                     hotplug mechanisms for reconfiguring processors require
                     global operations and hundreds of milliseconds to
                     complete. Second, configurations may be mutually
                     exclusive: using power to speed one core means it
                     cannot be used to speed another. Current schedulers
                     cannot manage this requirement. We present Chameleon,
                     an extension to Linux to support dynamic processors
                     that can reconfigure their cores at runtime. Chameleon
                     provides processor proxies to enable rapid
                     reconfiguration, execution objects to abstract the
                     processing capabilities of physical CPUs, and a cluster
                     scheduler to balance the needs of sequential and
                     parallel programs. In experiments that emulate a
                     dynamic processor, we find that Chameleon can
                     reconfigure processors 100,000 times faster than Linux
                     and allows applications full access to hardware
                     capabilities: sequential code runs at full speed on a
                     powerful execution context, while parallel code runs on
                     as many cores as possible.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Hwang:2012:CRD,
  author          = {Andy A. Hwang and Ioan A. Stefanovici and Bianca
                     Schroeder},
  title           = {Cosmic rays don't strike twice: understanding the
                     nature of {DRAM} errors and the implications for system
                     design},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {40},
  number          = {1},
  pages           = {111--122},
  month           = mar,
  year            = {2012},
  DOI             = {https://doi.org/10.1145/2189750.2150989},
  bibdate         = {Fri Jun 1 17:06:46 MDT 2012},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note            = {ASPLOS '12 conference proceedings.},
  abstract        = {Main memory is one of the leading hardware causes for
                     machine crashes in today's datacenters. Designing,
                     evaluating and modeling systems that are resilient
                     against memory errors requires a good understanding of
                     the underlying characteristics of errors in DRAM in the
                     field. While there have recently been a few first
                     studies on DRAM errors in production systems, these
                     have been too limited in either the size of the data
                     set or the granularity of the data to conclusively
                     answer many of the open questions on DRAM errors. Such
                     questions include, for example, the prevalence of soft
                     errors compared to hard errors, or the analysis of
                     typical patterns of hard errors. In this paper, we
                     study data on DRAM errors collected on a diverse range
                     of production systems in total covering nearly 300
                     terabyte-years of main memory. As a first contribution,
                     we provide a detailed analytical study of DRAM error
                     characteristics, including both hard and soft errors.
                     We find that a large fraction of DRAM errors in the
                     field can be attributed to hard errors and we provide a
                     detailed analytical study of their characteristics. As
                     a second contribution, the paper uses the results from
                     the measurement study to identify a number of promising
                     directions for designing more resilient systems and
                     evaluates the potential of different protection
                     mechanisms in the light of realistic error patterns.
                     One of our findings is that simple page retirement
                     policies might be able to mask a large number of DRAM
                     errors in production systems, while sacrificing only a
                     negligible fraction of the total DRAM in the system.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Hari:2012:REA,
  author          = {Siva Kumar Sastry Hari and Sarita V. Adve and Helia
                     Naeimi and Pradeep Ramachandran},
  title           = {{Relyzer}: exploiting application-level fault
                     equivalence to analyze application resiliency to
                     transient faults},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {40},
  number          = {1},
  pages           = {123--134},
  month           = mar,
  year            = {2012},
  DOI             = {https://doi.org/10.1145/2189750.2150990},
  bibdate         = {Fri Jun 1 17:06:46 MDT 2012},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note            = {ASPLOS '12 conference proceedings.},
  abstract        = {Future microprocessors need low-cost solutions for
                     reliable operation in the presence of failure-prone
                     devices. A promising approach is to detect hardware
                     faults by deploying low-cost monitors of software-level
                     symptoms of such faults. Recently, researchers have
                     shown these mechanisms work well, but there remains a
                     non-negligible risk that several faults may escape the
                     symptom detectors and result in silent data corruptions
                     (SDCs). Most prior evaluations of symptom-based
                     detectors perform fault injection campaigns on
                     application benchmarks, where each run simulates the
                     impact of a fault injected at a hardware site at a
                     certain point in the application's execution
                     (application fault site). Since the total number of
                     application fault sites is very large (trillions for
                     standard benchmark suites), it is not feasible to study
                     all possible faults. Previous work therefore typically
                     studies a randomly selected sample of faults. Such
                     studies do not provide any feedback on the portions of
                     the application where faults were not injected. Some of
                     those instructions may be vulnerable to SDCs, and
                     identifying them could allow protecting them through
                     other means if needed. This paper presents Relyzer, an
                     approach that systematically analyzes all application
                     fault sites and carefully picks a small subset to
                     perform selective fault injections for transient
                     faults. Relyzer employs novel fault pruning techniques
                     that prune faults that need detailed study by either
                     predicting their outcomes or showing them equivalent to
                     other faults. We find that Relyzer prunes about 99.78\%
                     of the total faults across twelve applications studied
                     here, reducing the faults that require detailed
                     simulation by 3 to 5 orders of magnitude for most of
                     the applications. Fault injection simulations on the
                     remaining faults can identify SDC causing faults in the
                     entire application. Some of Relyzer's techniques rely
                     on heuristics to determine fault equivalence. Our
                     validation efforts show that Relyzer determines fault
                     outcomes with 96\% accuracy, averaged across all the
                     applications studied here.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Feiner:2012:CKI,
  author          = {Peter Feiner and Angela Demke Brown and Ashvin Goel},
  title           = {Comprehensive kernel instrumentation via dynamic
                     binary translation},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {40},
  number          = {1},
  pages           = {135--146},
  month           = mar,
  year            = {2012},
  DOI             = {https://doi.org/10.1145/2189750.2150992},
  bibdate         = {Fri Jun 1 17:06:46 MDT 2012},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note            = {ASPLOS '12 conference proceedings.},
  abstract        = {Dynamic binary translation (DBT) is a powerful
                     technique that enables fine-grained monitoring and
                     manipulation of an existing program binary. At the user
                     level, it has been employed extensively to develop
                     various analysis, bug-finding, and security tools. Such
                     tools are currently not available for operating system
                     (OS) binaries since no comprehensive DBT framework
                     exists for the OS kernel. To address this problem, we
                     have developed a DBT framework that runs as a Linux
                     kernel module, based on the user-level DynamoRIO
                     framework. Our approach is unique in that it controls
                     all kernel execution, including interrupt and exception
                     handlers and device drivers, enabling comprehensive
                     instrumentation of the OS without imposing any overhead
                     on user-level code. In this paper, we discuss the key
                     challenges in designing and building an in-kernel DBT
                     framework and how the design differs from user-space.
                     We use our framework to build several sample
                     instrumentations, including simple instruction counting
                     as well as an implementation of shadow memory for the
                     kernel. Using the shadow memory, we build a kernel
                     stack overflow protection tool and a memory
                     addressability checking tool. Qualitatively, the system
                     is fast enough and stable enough to run the normal
                     desktop workload of one of the authors for several
                     weeks.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Odaira:2012:COA,
  author          = {Rei Odaira and Toshio Nakatani},
  title           = {Continuous object access profiling and optimizations
                     to overcome the memory wall and bloat},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {40},
  number          = {1},
  pages           = {147--158},
  month           = mar,
  year            = {2012},
  DOI             = {https://doi.org/10.1145/2189750.2150993},
  bibdate         = {Fri Jun 1 17:06:46 MDT 2012},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note            = {ASPLOS '12 conference proceedings.},
  abstract        = {Future microprocessors will have more serious memory
                     wall problems since they will include more cores and
                     threads in each chip. Similarly, future applications
                     will have more serious memory bloat problems since they
                     are more often written using object-oriented languages
                     and reusable frameworks. To overcome such problems, the
                     language runtime environments must accurately and
                     efficiently profile how programs access objects. We
                     propose Barrier Profiler, a low-overhead object access
                     profiler using a memory-protection-based approach
                     called pointer barrierization and adaptive overhead
                     reduction techniques. Unlike previous
                     memory-protection-based techniques, pointer
                     barrierization offers per-object protection by
                     converting all of the pointers to a given object to
                     corresponding barrier pointers that point to protected
                     pages. Barrier Profiler achieves low overhead by not
                     causing signals at object accesses that are unrelated
                     to the needed profiles, based on profile feedback and a
                     compiler analysis. Our experimental results showed
                     Barrier Profiler provided sufficiently accurate
                     profiles with 1.3\% on average and at most 3.4\%
                     performance overhead for allocation-intensive
                     benchmarks, while previous code-instrumentation-based
                     techniques suffered from 9.2\% on average and at most
                     12.6\% overhead. The low overhead allows Barrier
                     Profiler to be run continuously on production systems.
                     Using Barrier Profiler, we implemented two new online
                     optimizations to compress write-only character arrays
                     and to adjust the initial sizes of mostly non-accessed
                     arrays. They resulted in speed-ups of up to 8.6\% and
                     36\%, respectively.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Greathouse:2012:CUW,
  author = {Joseph L. Greathouse and Hongyi Xin and Yixin Luo and Todd Austin},
  title = {A case for unlimited watchpoints},
  journal = j-COMP-ARCH-NEWS,
  volume = {40},
  number = {1},
  pages = {159--172},
  month = mar,
  year = {2012},
  DOI = {https://doi.org/10.1145/2189750.2150994},
  bibdate = {Fri Jun 1 17:06:46 MDT 2012},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note = {ASPLOS '12 conference proceedings.},
  abstract = {Numerous tools have been proposed to help developers fix
    software errors and inefficiencies. Widely-used techniques such as
    memory checking suffer from overheads that limit their use to
    pre-deployment testing, while more advanced systems have such severe
    performance impacts that they may require special-purpose hardware.
    Previous works have described hardware that can accelerate individual
    analyses, but such specialization stymies adoption; generalized
    mechanisms are more likely to be added to commercial processors. This
    paper demonstrates that the ability to set an unlimited number of
    fine-grain data watchpoints can reduce the runtime overheads of
    numerous dynamic software analysis techniques. We detail the
    watchpoint capabilities required to accelerate these analyses while
    remaining general enough to be useful in the future. We describe a
    hardware design that stores watchpoints in main memory and utilizes
    two different on-chip caches to accelerate performance. The first is a
    bitmap lookaside buffer that stores fine-grained watchpoints, while
    the second is a range cache that can efficiently hold large contiguous
    regions of watchpoints. As an example of the power of such a system,
    it is possible to use watchpoints to accelerate read/write set checks
    in a software data race detector by nearly 9$ \times $.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Olszewski:2012:AAS,
  author =       "Marek Olszewski and Qin Zhao and David Koh and Jason
                 Ansel and Saman Amarasinghe",
  title =        "{Aikido}: accelerating shared data dynamic analyses",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "1",
  pages =        "173--184",
  month =        mar,
  year =         "2012",
  DOI =          "https://doi.org/10.1145/2189750.2150995",
  bibdate =      "Fri Jun 1 17:06:46 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ASPLOS '12 conference proceedings.",
  abstract =     "Despite a burgeoning demand for parallel programs, the
                 tools available to developers working on shared-memory
                 multicore processors have lagged behind. One reason for
                 this is the lack of hardware support for inspecting the
                 complex behavior of these parallel programs.
                 Inter-thread communication, which must be instrumented
                 for many types of analyses, may occur with any memory
                 operation. To detect such thread communication in
                 software, many existing tools require the
                 instrumentation of all memory operations, which leads
                 to significant performance overheads. To reduce this
                 overhead, some existing tools resort to random sampling
                 of memory operations, which introduces false negatives.
                 Unfortunately, neither of these approaches provide the
                 speed and accuracy programmers have traditionally
                 expected from their tools. In this work, we present
                 Aikido, a new system and framework that enables the
                 development of efficient and transparent analyses that
                 operate on shared data. Aikido uses a hybrid of
                 existing hardware features and dynamic binary rewriting
                 to detect thread communication with low overhead.
                 Aikido runs a custom hypervisor below the operating
                 system, which exposes per-thread hardware protection
                 mechanisms not available in any widely used operating
                 system. This hybrid approach allows us to benefit from
                 the low cost of detecting memory accesses with
                 hardware, while maintaining the word-level accuracy of
                 a software-only approach. To evaluate our framework, we
                 have implemented an Aikido-enabled vector clock race
                 detector. Our results show that the Aikido enabled
                 race-detector outperforms existing techniques that
                 provide similar accuracy by up to 6.0$ \times $, and
                 76\% on average, on the PARSEC benchmark suite.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kasikci:2012:DRV,
  author = {Baris Kasikci and Cristian Zamfir and George Candea},
  title = {Data races vs. data race bugs: telling the difference with
    {Portend}},
  journal = j-COMP-ARCH-NEWS,
  volume = {40},
  number = {1},
  pages = {185--198},
  month = mar,
  year = {2012},
  DOI = {https://doi.org/10.1145/2189750.2150997},
  bibdate = {Fri Jun 1 17:06:46 MDT 2012},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note = {ASPLOS '12 conference proceedings.},
  abstract = {Even though most data races are harmless, the harmful ones
    are at the heart of some of the worst concurrency bugs. Alas, spotting
    just the harmful data races in programs is like finding a needle in a
    haystack: 76\%--90\% of the true data races reported by
    state-of-the-art race detectors turn out to be harmless [45]. We
    present Portend, a tool that not only detects races but also
    automatically classifies them based on their potential consequences:
    Could they lead to crashes or hangs? Could their effects be visible
    outside the program? Are they harmless? Our proposed technique
    achieves high accuracy by efficiently analyzing multiple paths and
    multiple thread schedules in combination, and by performing symbolic
    comparison between program outputs. We ran Portend on 7 real-world
    applications: it detected 93 true data races and correctly classified
    92 of them, with no human effort. 6 of them are harmful races.
    Portend's classification accuracy is up to 88\% higher than that of
    existing tools, and it produces easy-to-understand evidence of the
    consequences of harmful races, thus both proving their harmfulness and
    making debugging easier. We envision Portend being used for testing
    and debugging, as well as for automatically triaging bug reports.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Clements:2012:SAS,
  author =       "Austin T. Clements and M. Frans Kaashoek and Nickolai
                 Zeldovich",
  title =        "Scalable address spaces using {RCU} balanced trees",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "1",
  pages =        "199--210",
  month =        mar,
  year =         "2012",
  DOI =          "https://doi.org/10.1145/2189750.2150998",
  bibdate =      "Fri Jun 1 17:06:46 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ASPLOS '12 conference proceedings.",
  abstract =     "Software developers commonly exploit multicore
                 processors by building multithreaded software in which
                 all threads of an application share a single address
                 space. This shared address space has a cost: kernel
                 virtual memory operations such as handling soft page
                 faults, growing the address space, mapping files, etc.
                 can limit the scalability of these applications. In
                 widely-used operating systems, all of these operations
                 are synchronized by a single per-process lock. This
                 paper contributes a new design for increasing the
                 concurrency of kernel operations on a shared address
                 space by exploiting read-copy-update (RCU) so that soft
                 page faults can both run in parallel with operations
                 that mutate the same address space and avoid contending
                 with other page faults on shared cache lines. To enable
                 such parallelism, this paper also introduces an
                 RCU-based binary balanced tree for storing memory
                 mappings. An experimental evaluation using three
                 multithreaded applications shows performance
                 improvements on 80 cores ranging from 1.7$ \times $ to
                 3.4$ \times $ for an implementation of this design in
                 the Linux 2.6.37 kernel. The RCU-based binary tree
                 enables soft page faults to run at a constant cost with
                 an increasing number of cores, suggesting that the
                 design will scale well beyond 80 cores.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Volos:2012:ATM,
  author = {Haris Volos and Andres Jaan Tack and Michael M. Swift and Shan
    Lu},
  title = {Applying transactional memory to concurrency bugs},
  journal = j-COMP-ARCH-NEWS,
  volume = {40},
  number = {1},
  pages = {211--222},
  month = mar,
  year = {2012},
  DOI = {https://doi.org/10.1145/2189750.2150999},
  bibdate = {Fri Jun 1 17:06:46 MDT 2012},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note = {ASPLOS '12 conference proceedings.},
  abstract = {Multithreaded programs often suffer from synchronization
    bugs such as atomicity violations and deadlocks. These bugs arise from
    complicated locking strategies and ad hoc synchronization methods to
    avoid the use of locks. A survey of the bug databases of major
    open-source applications shows that concurrency bugs often take
    multiple fix attempts, and that fixes often introduce yet more
    concurrency bugs. Transactional memory (TM) enables programmers to
    declare regions of code atomic without specifying a lock and has the
    potential to avoid these bugs. Where most previous studies have
    focused on using TM to write new programs from scratch, we consider
    its utility in fixing existing programs with concurrency bugs. We
    therefore investigate four methods of using TM on three concurrent
    programs. Overall, we find that 29\% of the bugs are not fixable by
    transactional memory, showing that TM does not address many important
    types of concurrency bugs. In particular, TM works poorly with
    extremely long critical sections and with deadlocks involving both
    condition variables and I/O. Conversely, we find that for 56\% of the
    bugs, transactional memory offers demonstrable value by simplifying
    the reasoning behind a fix or the effort to implement a fix, and using
    transactions in the first place would have avoided 71\% of the bugs
    examined. We also find that ad hoc synchronization put in place to
    avoid the overhead of locking can be greatly simplified with TM, but
    requires hardware support to perform well.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Joao:2012:BIS,
  author = {Jos{\'e} A. Joao and M. Aater Suleman and Onur Mutlu and Yale
    N. Patt},
  title = {Bottleneck identification and scheduling in multithreaded
    applications},
  journal = j-COMP-ARCH-NEWS,
  volume = {40},
  number = {1},
  pages = {223--234},
  month = mar,
  year = {2012},
  DOI = {https://doi.org/10.1145/2189750.2151001},
  bibdate = {Fri Jun 1 17:06:46 MDT 2012},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note = {ASPLOS '12 conference proceedings.},
  abstract = {Performance of multithreaded applications is limited by a
    variety of bottlenecks, e.g. critical sections, barriers and slow
    pipeline stages. These bottlenecks serialize execution, waste valuable
    execution cycles, and limit scalability of applications. This paper
    proposes Bottleneck Identification and Scheduling in Multithreaded
    Applications (BIS), a cooperative software-hardware mechanism to
    identify and accelerate the most critical bottlenecks. BIS identifies
    which bottlenecks are likely to reduce performance by measuring the
    number of cycles threads have to wait for each bottleneck, and
    accelerates those bottlenecks using one or more fast cores on an
    Asymmetric Chip Multi-Processor (ACMP). Unlike previous work that
    targets specific bottlenecks, BIS can identify and accelerate
    bottlenecks regardless of their type. We compare BIS to four previous
    approaches and show that it outperforms the best of them by 15\% on
    average. BIS' performance improvement increases as the number of cores
    and the number of fast cores in the system increase.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Radojkovic:2012:OTA,
  author =       "Petar Radojkovi{\'c} and Vladimir {\v{C}}akarevi{\'c}
                 and Miquel Moret{\'o} and Javier Verd{\'u} and Alex
                 Pajuelo and Francisco J. Cazorla and Mario Nemirovsky
                 and Mateo Valero",
  title =        "Optimal task assignment in multithreaded processors: a
                 statistical approach",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "1",
  pages =        "235--248",
  month =        mar,
  year =         "2012",
  DOI =          "https://doi.org/10.1145/2189750.2151002",
  bibdate =      "Fri Jun 1 17:06:46 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ASPLOS '12 conference proceedings.",
  abstract =     "The introduction of massively multithreaded (MMT)
                 processors, comprised of a large number of cores with
                 many shared resources, has made task scheduling, in
                 particular task to hardware thread assignment, one of
                 the most promising ways to improve system performance.
                 However, finding an optimal task assignment for a
                 workload running on MMT processors is an NP-complete
                 problem. Due to the fact that the performance of the
                 best possible task assignment is unknown, the room for
                 improvement of current task-assignment algorithms
                 cannot be determined. This is a major problem for the
                 industry because it could lead to: (1)~a waste of
                 resources if excessive effort is devoted to improving a
                 task assignment algorithm that already provides a
                 performance that is close to the optimal one, or
                 (2)~significant performance loss if insufficient effort
                 is devoted to improving poorly-performing task
                 assignment algorithms. In this paper, we present a
                 method based on Extreme Value Theory that allows the
                 prediction of the performance of the optimal task
                 assignment in MMT processors. We further show that
                 executing a sample of several hundred or several
                 thousand random task assignments is enough to obtain,
                 with very high confidence, an assignment with a
                 performance that is close to the optimal one. We
                 validate our method with an industrial case study for a
                 set of multithreaded network applications running on an
                 UltraSPARC~T2 processor.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Jaleel:2012:CCR,
  author =       "Aamer Jaleel and Hashem H. Najaf-abadi and Samantika
                 Subramaniam and Simon C. Steely and Joel Emer",
  title =        "{CRUISE}: cache replacement and utility-aware
                 scheduling",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "1",
  pages =        "249--260",
  month =        mar,
  year =         "2012",
  DOI =          "https://doi.org/10.1145/2189750.2151003",
  bibdate =      "Fri Jun 1 17:06:46 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ASPLOS '12 conference proceedings.",
  abstract =     "When several applications are co-scheduled to run on a
                 system with multiple shared LLCs, there is opportunity
                 to improve system performance. This opportunity can be
                 exploited by the hardware, software, or a combination
                 of both hardware and software. The software, i.e., an
                 operating system or hypervisor, can improve system
                 performance by co-scheduling jobs on LLCs to minimize
                 shared cache contention. The hardware can improve
                 system throughput through better replacement policies
                 by allocating more cache resources to applications that
                 benefit from the cache and less to those applications
                 that do not. This study presents a detailed analysis on
                 the interactions between intelligent scheduling and
                 smart cache replacement policies. We find that smart
                 cache replacement reduces the burden on software to
                 provide intelligent scheduling decisions. However,
                 under smart cache replacement, there is still room to
                 improve performance from better application
                 co-scheduling. We find that co-scheduling decisions are
                 a function of the underlying LLC replacement policy. We
                 propose Cache Replacement and Utility-aware Scheduling
                 (CRUISE) --- a hardware/software co-designed approach
                 for shared cache management. For 4-core and 8-core
                 CMPs, we find that CRUISE approaches the performance of
                 an ideal job co-scheduling policy under different LLC
                 replacement policies.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{DeVuyst:2012:EMH,
  author = {Matthew DeVuyst and Ashish Venkat and Dean M. Tullsen},
  title = {Execution migration in a heterogeneous-{ISA} chip
    multiprocessor},
  journal = j-COMP-ARCH-NEWS,
  volume = {40},
  number = {1},
  pages = {261--272},
  month = mar,
  year = {2012},
  DOI = {https://doi.org/10.1145/2189750.2151004},
  bibdate = {Fri Jun 1 17:06:46 MDT 2012},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note = {ASPLOS '12 conference proceedings.},
  abstract = {Prior research has shown that single-ISA heterogeneous chip
    multiprocessors have the potential for greater performance and energy
    efficiency than homogeneous CMPs. However, restricting the cores to a
    single ISA removes an important opportunity for greater heterogeneity.
    To take full advantage of a heterogeneous-ISA CMP, however, we must be
    able to migrate execution among heterogeneous cores in order to adapt
    to program phase changes and changing external conditions (e.g.,
    system power state). This paper explores migration on
    heterogeneous-ISA CMPs. This is non-trivial because program state is
    kept in an architecture-specific form; therefore, state transformation
    is necessary for migration. To keep migration cost low, the amount of
    state that requires transformation must be minimized. This work
    identifies large portions of program state whose form is not critical
    for performance; the compiler is modified to produce programs that
    keep most of their state in an architecture-neutral form so that only
    a small number of data items must be repositioned and no pointers need
    to be changed. The result is low migration cost with minimal sacrifice
    of non-migration performance. Additionally, this work leverages binary
    translation to enable instantaneous migration. When migration is
    requested, the program is immediately migrated to a different core
    where binary translation runs for a short time until a function call
    is reached, at which point program state is transformed and execution
    continues natively on the new core. This system can tolerate
    migrations as often as every 100 ms and still retain 95\% of the
    performance of a system that does not do, or support, migration.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Lin:2012:ESC,
  author = {Changhui Lin and Vijay Nagarajan and Rajiv Gupta and Bharghava
    Rajaram},
  title = {Efficient sequential consistency via conflict ordering},
  journal = j-COMP-ARCH-NEWS,
  volume = {40},
  number = {1},
  pages = {273--286},
  month = mar,
  year = {2012},
  DOI = {https://doi.org/10.1145/2189750.2151006},
  bibdate = {Fri Jun 1 17:06:46 MDT 2012},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note = {ASPLOS '12 conference proceedings.},
  abstract = {Although the sequential consistency (SC) model is the most
    intuitive, processor designers often choose to support relaxed memory
    consistency models for higher performance. This is because SC
    implementations that match the performance of relaxed memory models
    require post-retirement speculation and its associated hardware costs.
    In this paper we propose an efficient approach for enforcing SC
    without requiring post-retirement speculation. While prior SC
    implementations guarantee SC by explicitly completing memory
    operations within a processor in program order, we guarantee SC by
    completing conflicting memory operations, within and across
    processors, in an order that is consistent with the program order.
    More specifically, we identify those conflicting memory operations
    whose ordering is critical for the maintenance of SC and explicitly
    order them. This allows us to safely (non-speculatively) complete
    memory operations past pending writes, thus reducing memory ordering
    stalls. Our experiments with SPLASH-2 programs show that SC can be
    achieved efficiently, with performance comparable to RMO (relaxed
    memory order).},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Cheriton:2012:HAS,
  author = {David Cheriton and Amin Firoozshahian and Alex Solomatnikov and
    John P. Stevenson and Omid Azizi},
  title = {{HICAMP}: architectural support for efficient concurrency-safe
    shared structured data access},
  journal = j-COMP-ARCH-NEWS,
  volume = {40},
  number = {1},
  pages = {287--300},
  month = mar,
  year = {2012},
  DOI = {https://doi.org/10.1145/2189750.2151007},
  bibdate = {Fri Jun 1 17:06:46 MDT 2012},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note = {ASPLOS '12 conference proceedings.},
  abstract = {Programming language and operating system support for
    efficient concurrency-safe access to shared data is a key concern for
    the effective use of multi-core processors. Most research has focused
    on the software model of multiple threads accessing this data within a
    single shared address space. However, many real applications are
    actually structured as multiple separate processes for fault isolation
    and simplified synchronization. In this paper, we describe the HICAMP
    architecture and its innovative memory system, which supports
    efficient concurrency safe access to structured shared data without
    incurring the overhead of inter-process communication. The HICAMP
    architecture also provides support for programming language and OS
    structures such as threads, iterators, read-only access and atomic
    update. In addition to demonstrating that HICAMP is beneficial for
    multi-process structured applications, our evaluation shows that the
    same mechanisms provide substantial benefits for other areas,
    including sparse matrix computations and virtualization.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Esmaeilzadeh:2012:ASD,
  author = {Hadi Esmaeilzadeh and Adrian Sampson and Luis Ceze and Doug
    Burger},
  title = {Architecture support for disciplined approximate programming},
  journal = j-COMP-ARCH-NEWS,
  volume = {40},
  number = {1},
  pages = {301--312},
  month = mar,
  year = {2012},
  DOI = {https://doi.org/10.1145/2189750.2151008},
  bibdate = {Fri Jun 1 17:06:46 MDT 2012},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note = {ASPLOS '12 conference proceedings.},
  abstract = {Disciplined approximate programming lets programmers declare
    which parts of a program can be computed approximately and
    consequently at a lower energy cost. The compiler proves statically
    that all approximate computation is properly isolated from precise
    computation. The hardware is then free to selectively apply
    approximate storage and approximate computation with no need to
    perform dynamic correctness checks. In this paper, we propose an
    efficient mapping of disciplined approximate programming onto
    hardware. We describe an ISA extension that provides approximate
    operations and storage, which give the hardware freedom to save energy
    at the cost of accuracy. We then propose Truffle, a microarchitecture
    design that efficiently supports the ISA extensions. The basis of our
    design is dual-voltage operation, with a high voltage for precise
    operations and a low voltage for approximate operations. The key
    aspect of the microarchitecture is its dependence on the instruction
    stream to determine when to use the low voltage. We evaluate the power
    savings potential of in-order and out-of-order Truffle configurations
    and explore the resulting quality of service degradation. We evaluate
    several applications and demonstrate energy savings up to 43\%.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Meisner:2012:DAS,
  author          = {David Meisner and Thomas F. Wenisch},
  title           = {{DreamWeaver}: architectural support for deep sleep},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {40},
  number          = {1},
  pages           = {313--324},
  month           = mar,
  year            = {2012},
  DOI             = {https://doi.org/10.1145/2189750.2151009},
  bibdate         = {Fri Jun 1 17:06:46 MDT 2012},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note            = {ASPLOS '12 conference proceedings.},
  abstract        = {Numerous data center services exhibit low average
    utilization leading to poor energy efficiency. Although CPU voltage
    and frequency scaling historically has been an effective means to
    scale down power with utilization, transistor scaling trends are
    limiting its effectiveness and the CPU is accounting for a shrinking
    fraction of system power. Recent research advocates the use of
    full-system idle low-power modes to combat energy losses, as such
    modes provide the deepest power savings with bounded response time
    impact. However, the trend towards increasing cores per die is
    undermining the effectiveness of these sleep modes, particularly for
    request-parallel data center applications, because the independent
    idle periods across individual cores are unlikely to align by
    happenstance. We propose DreamWeaver, architectural support to
    facilitate deep sleep for request-parallel applications on multicore
    servers. DreamWeaver comprises two elements: Weave Scheduling, a
    scheduling policy to coalesce idle and busy periods across cores to
    create opportunities for system-wide deep sleep; and the Dream
    Processor, a light-weight co-processor that monitors incoming network
    traffic and suspended work during sleep to determine when the system
    must wake. DreamWeaver is based on two key concepts: (1) stall
    execution and sleep anytime any core is unoccupied, but (2) constrain
    the maximum time any request may be stalled. Unlike prior scheduling
    approaches, DreamWeaver will preempt execution to sleep, maximizing
    time spent at the systems' most efficient operating point. We
    demonstrate that DreamWeaver can smoothly trade-off bounded,
    predictable increases in 99th-percentile response time for increasing
    power savings, and strictly dominates the savings available with
    voltage and frequency scaling and timeout-based request batching
    schemes.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{King:2012:AGH,
  author          = {Myron King and Nirav Dave and Arvind},
  title           = {Automatic generation of hardware\slash software
    interfaces},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {40},
  number          = {1},
  pages           = {325--336},
  month           = mar,
  year            = {2012},
  DOI             = {https://doi.org/10.1145/2189750.2151011},
  bibdate         = {Fri Jun 1 17:06:46 MDT 2012},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note            = {ASPLOS '12 conference proceedings.},
  abstract        = {Enabling new applications for mobile devices often
    requires the use of specialized hardware to reduce power consumption.
    Because of time-to-market pressure, current design methodologies for
    embedded applications require an early partitioning of the design,
    allowing the hardware and software to be developed simultaneously,
    each adhering to a rigid interface contract. This approach is
    problematic for two reasons: (1) a detailed hardware-software
    interface is difficult to specify until one is deep into the design
    process, and (2) it prevents the later migration of functionality
    across the interface motivated by efficiency concerns or the addition
    of features. We address this problem using the Bluespec Codesign
    Language~(BCL) which permits the designer to specify the
    hardware-software partition in the source code, allowing the compiler
    to synthesize efficient software and hardware along with transactors
    for communication between the partitions. The movement of
    functionality across the hardware-software boundary is accomplished
    by simply specifying a new partitioning, and since the compiler
    automatically generates the desired interface specifications, it
    eliminates yet another error-prone design task. In this paper we
    present BCL, an extension of a commercially available hardware design
    language (Bluespec SystemVerilog), a new software compiling scheme,
    and preliminary results generated using our compiler for various
    hardware-software decompositions of an Ogg Vorbis audio decoder, and
    a ray-tracing application.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Martignoni:2012:PEL,
  author          = {Lorenzo Martignoni and Stephen McCamant and Pongsin
    Poosankam and Dawn Song and Petros Maniatis},
  title           = {Path-exploration lifting: hi-fi tests for lo-fi
    emulators},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {40},
  number          = {1},
  pages           = {337--348},
  month           = mar,
  year            = {2012},
  DOI             = {https://doi.org/10.1145/2189750.2151012},
  bibdate         = {Fri Jun 1 17:06:46 MDT 2012},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note            = {ASPLOS '12 conference proceedings.},
  abstract        = {Processor emulators are widely used to provide
    isolation and instrumentation of binary software. However they have
    proved difficult to implement correctly: processor specifications
    have many corner cases that are not exercised by common workloads. It
    is untenable to base other system security properties on the
    correctness of emulators that have received only ad-hoc testing. To
    obtain emulators that are worthy of the required trust, we propose a
    technique to explore a high-fidelity emulator with symbolic
    execution, and then lift those test cases to test a lower-fidelity
    emulator. The high-fidelity emulator serves as a proxy for the
    hardware specification, but we can also further validate by running
    the tests on real hardware. We implement our approach and apply it to
    generate about 610,000 test cases; for about 95\% of the instructions
    we achieve complete path coverage. The tests reveal thousands of
    individual differences; we analyze those differences to shed light on
    a number of root causes, such as atomicity violations and missing
    security features.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Hong:2012:GMD,
  author =       "Sungpack Hong and Hassan Chafi and Eric Sedlar and
                 Kunle Olukotun",
  title =        "{Green-Marl}: a {DSL} for easy and efficient graph
                 analysis",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "1",
  pages =        "349--362",
  month =        mar,
  year =         "2012",
  DOI =          "https://doi.org/10.1145/2189750.2151013",
  bibdate =      "Fri Jun 1 17:06:46 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ASPLOS '12 conference proceedings.",
  abstract =     "The increasing importance of graph-data based
                 applications is fueling the need for highly efficient
                 and parallel implementations of graph analysis
                 software. In this paper we describe Green-Marl, a
                 domain-specific language (DSL) whose high level
                 language constructs allow developers to describe their
                 graph analysis algorithms intuitively, but expose the
                 data-level parallelism inherent in the algorithms. We
                 also present our Green-Marl compiler which translates
                 high-level algorithmic description written in
                 Green-Marl into an efficient C++ implementation by
                 exploiting this exposed data-level parallelism.
                 Furthermore, our Green-Marl compiler applies a set of
                 optimizations that take advantage of the high-level
                 semantic knowledge encoded in the Green-Marl DSL. We
                 demonstrate that graph analysis algorithms can be
                 written very intuitively with Green-Marl through some
                 examples, and our experimental results show that the
                 compiler-generated implementation out of such
                 descriptions performs as well as or better than
                 highly-tuned hand-coded implementations.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Park:2012:SDE,
  author =       "Yongjun Park and Sangwon Seo and Hyunchul Park and
                 Hyoun Kyu Cho and Scott Mahlke",
  title =        "{SIMD} defragmenter: efficient {ILP} realization on
                 data-parallel architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "1",
  pages =        "363--374",
  month =        mar,
  year =         "2012",
  DOI =          "https://doi.org/10.1145/2189750.2151014",
  bibdate =      "Fri Jun 1 17:06:46 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ASPLOS '12 conference proceedings.",
  abstract =     "Single-instruction multiple-data (SIMD) accelerators
                 provide an energy-efficient platform to scale the
                 performance of mobile systems while still retaining
                 post-programmability. The central challenge is
                 translating the parallel resources of the SIMD hardware
                 into real application performance. In scientific
                 applications, automatic vectorization techniques have
                 proven quite effective at extracting large levels of
                 data-level parallelism (DLP). However, vectorization is
                 often much less effective for media applications due to
                 low trip count loops, complex control flow, and
                 non-uniform execution behavior. As a result, SIMD lanes
                 remain idle due to insufficient DLP. To attack this
                 problem, this paper proposes a new vectorization pass
                 called SIMD Defragmenter to uncover hidden DLP that
                 lurks below the surface in the form of
                 instruction-level parallelism (ILP). The difficulty is
                 managing the data packing/unpacking overhead that can
                 easily exceed the benefits gained through SIMD
                 execution. The SIMD defragmenter overcomes this problem
                 by identifying groups of compatible instructions
                 (subgraphs) that can be executed in parallel across the
                 SIMD lanes. By SIMDizing in bulk at the subgraph level,
                 packing/unpacking overhead is minimized. On a 16-lane
                 SIMD processor, experimental results show that SIMD
                 defragmentation achieves a mean 1.6x speedup over
                 traditional loop vectorization and a 31\% gain over
                 prior research approaches for converting ILP to DLP.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Simha:2012:UAS,
  author          = {Dilip Nijagal Simha and Maohua Lu and Tzi-cker
    Chiueh},
  title           = {An update-aware storage system for low-locality
    update-intensive workloads},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {40},
  number          = {1},
  pages           = {375--386},
  month           = mar,
  year            = {2012},
  DOI             = {https://doi.org/10.1145/2189750.2151016},
  bibdate         = {Fri Jun 1 17:06:46 MDT 2012},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note            = {ASPLOS '12 conference proceedings.},
  abstract        = {Traditional storage systems provide a simple
    read/write interface, which is inadequate for low-locality
    update-intensive workloads because it limits the disk scheduling
    flexibility and results in inefficient use of buffer memory and raw
    disk bandwidth. This paper describes an update-aware disk access
    interface that allows applications to explicitly specify disk update
    requests and associate with such requests call-back functions that
    will be invoked when the requested disk blocks are brought into
    memory. Because call-back functions offer a continuation mechanism
    after retrieval of requested blocks, storage systems supporting this
    interface are given more flexibility in scheduling pending disk
    update requests. In particular, this interface enables a simple but
    effective technique called Batching mOdifications with Sequential
    Commit (BOSC), which greatly improves the sustained throughput of a
    storage system under low-locality update-intensive workloads. In
    addition, together with a space-efficient low-latency disk logging
    technique, BOSC is able to deliver the same durability guarantee as
    synchronous disk updates. Empirical measurements show that the random
    update throughput of a BOSC-based B+ tree is more than an order of
    magnitude higher than that of the same B+ tree implementation on a
    traditional storage system.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Caulfield:2012:PSU,
  author          = {Adrian M. Caulfield and Todor I. Mollov and Louis
    Alex Eisner and Arup De and Joel Coburn and Steven Swanson},
  title           = {Providing safe, user space access to fast, solid
    state disks},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {40},
  number          = {1},
  pages           = {387--400},
  month           = mar,
  year            = {2012},
  DOI             = {https://doi.org/10.1145/2189750.2151017},
  bibdate         = {Fri Jun 1 17:06:46 MDT 2012},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note            = {ASPLOS '12 conference proceedings.},
  abstract        = {Emerging fast, non-volatile memories (e.g., phase
    change memories, spin-torque MRAMs, and the memristor) reduce storage
    access latencies by an order of magnitude compared to
    state-of-the-art flash-based SSDs. This improved performance means
    that software overheads that had little impact on the performance of
    flash-based systems can present serious bottlenecks in systems that
    incorporate these new technologies. We describe a novel storage
    hardware and software architecture that nearly eliminates two sources
    of this overhead: Entering the kernel and performing file system
    permission checks. The new architecture provides a private,
    virtualized interface for each process and moves file system
    protection checks into hardware. As a result, applications can access
    file data without operating system intervention, eliminating OS and
    file system costs entirely for most accesses. We describe the support
    the system provides for fast permission checks in hardware, our
    approach to notifying applications when requests complete, and the
    small, easily portable changes required in the file system to support
    the new access model. Existing applications require no modification
    to use the new interface. We evaluate the performance of the system
    using a suite of microbenchmarks and database workloads and show that
    the new interface improves latency and bandwidth for 4 KB writes by
    60\% and 7.2x, respectively, OLTP database transaction throughput by
    up to 2.0x, and Berkeley-DB throughput by up to 5.7x. A streamlined
    asynchronous file IO interface built to fully utilize the new
    interface enables an additional 5.5x increase in throughput with 1
    thread and 2.8x increase in efficiency for 512 B transfers.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Narayanan:2012:WSP,
  author =       "Dushyanth Narayanan and Orion Hodson",
  title =        "Whole-system persistence",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "1",
  pages =        "401--410",
  month =        mar,
  year =         "2012",
  DOI =          "https://doi.org/10.1145/2189750.2151018",
  bibdate =      "Fri Jun 1 17:06:46 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ASPLOS '12 conference proceedings.",
  abstract =     "Today's databases and key-value stores commonly keep
                 all their data in main memory. A single server can have
                 over 100 GB of memory, and a cluster of such servers
                 can have 10s to 100s of TB. However, a storage back end
                 is still required for recovery from failures. Recovery
                 can last for minutes for a single server or hours for a
                 whole cluster, causing heavy load on the back end.
                 Non-volatile main memory (NVRAM) technologies can help
                 by allowing near-instantaneous recovery of in-memory
                 state. However, today's software does not support this
                 well. Block-based approaches such as persistent buffer
                 caches suffer from data duplication and block transfer
                 overheads. Recently, user-level persistent heaps have
                 been shown to have much better performance than these.
                 However they require substantial application
                 modification and still have significant runtime
                 overheads. This paper proposes whole-system persistence
                 (WSP) as an alternative. WSP is aimed at systems where
                 all memory is non-volatile. It transparently recovers
                 an application's entire state, making a failure appear
                 as a suspend/resume event. Runtime overheads are
                 eliminated by using ``flush on fail'': transient state
                 in processor registers and caches is flushed to NVRAM
                 only on failure, using the residual energy from the
                 system power supply. Our evaluation shows that this
                 approach has 1.6--13 times better runtime performance
                 than a persistent heap, and that flush-on-fail can
                 complete safely within 2--35\% of the residual energy
                 window provided by standard power supplies.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gordon:2012:EBM,
  author =       "Abel Gordon and Nadav Amit and Nadav Har'El and Muli
                 Ben-Yehuda and Alex Landau and Assaf Schuster and Dan
                 Tsafrir",
  title =        "{ELI}: bare-metal performance for {I/O}
                 virtualization",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "1",
  pages =        "411--422",
  month =        mar,
  year =         "2012",
  DOI =          "https://doi.org/10.1145/2189750.2151020",
  bibdate =      "Fri Jun 1 17:06:46 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ASPLOS '12 conference proceedings.",
  abstract =     "Direct device assignment enhances the performance of
                 guest virtual machines by allowing them to communicate
                 with I/O devices without host involvement. But even
                 with device assignment, guests are still unable to
                 approach bare-metal performance, because the host
                 intercepts all interrupts, including those interrupts
                 generated by assigned devices to signal to guests the
                 completion of their I/O requests. The host involvement
                 induces multiple unwarranted guest/host context
                 switches, which significantly hamper the performance of
                 I/O intensive workloads. To solve this problem, we
                 present ELI (ExitLess Interrupts), a software-only
                 approach for handling interrupts within guest virtual
                 machines directly and securely. By removing the host
                 from the interrupt handling path, ELI manages to
                 improve the throughput and latency of unmodified,
                 untrusted guests by 1.3x--1.6x, allowing them to reach
                 97\%--100\% of bare-metal performance even for the most
                 demanding I/O-intensive workloads.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Vasic:2012:DAR,
  author          = {Nedeljko Vasi{\'c} and Dejan Novakovi{\'c} and
    Svetozar Miucin and Dejan Kosti{\'c} and Ricardo Bianchini},
  title           = {{DejaVu}: accelerating resource allocation in
    virtualized environments},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {40},
  number          = {1},
  pages           = {423--436},
  month           = mar,
  year            = {2012},
  DOI             = {https://doi.org/10.1145/2189750.2151021},
  bibdate         = {Fri Jun 1 17:06:46 MDT 2012},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note            = {ASPLOS '12 conference proceedings.},
  abstract        = {Effective resource management of virtualized
    environments is a challenging task. State-of-the-art management
    systems either rely on analytical models or evaluate resource
    allocations by running actual experiments. However, both approaches
    incur a significant overhead once the workload changes. The former
    needs to re-calibrate and re-validate models, whereas the latter has
    to run a new set of experiments to select a new resource allocation.
    During the adaptation period, the system may run with an inefficient
    configuration. In this paper, we propose DejaVu --- a framework that
    (1) minimizes the resource management overhead by identifying a small
    set of workload classes for which it needs to evaluate resource
    allocation decisions, (2) quickly adapts to workload changes by
    classifying workloads using signatures and caching their preferred
    resource allocations at runtime, and (3) deals with interference by
    estimating an ``interference index''. We evaluate DejaVu by running
    representative network services on Amazon EC2. DejaVu achieves more
    than 10x speedup in adaptation time for each workload change relative
    to the state-of-the-art. By enabling quick adaptation, DejaVu saves
    up to 60\% of the service provisioning cost. Finally, DejaVu is
    easily deployable as it does not require any extensive
    instrumentation or human intervention.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Szefer:2012:ASH,
  author          = {Jakub Szefer and Ruby B. Lee},
  title           = {Architectural support for hypervisor-secure
    virtualization},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {40},
  number          = {1},
  pages           = {437--450},
  month           = mar,
  year            = {2012},
  DOI             = {https://doi.org/10.1145/2189750.2151022},
  bibdate         = {Fri Jun 1 17:06:46 MDT 2012},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note            = {ASPLOS '12 conference proceedings.},
  abstract        = {Virtualization has become a standard part of many
    computer systems. A key part of virtualization is the all-powerful
    hypervisor which manages the physical platform and can access all of
    its resources, including memory assigned to the guest virtual
    machines (VMs). Continuing releases of bug reports and exploits in
    the virtualization software show that defending the hypervisor
    against attacks is very difficult. In this work, we present
    hypervisor-secure virtualization --- a new research direction with
    the goal of protecting the guest VMs from an untrusted hypervisor. We
    also present the HyperWall architecture which achieves
    hypervisor-secure virtualization, using hardware to provide the
    protections. HyperWall allows a hypervisor to freely manage the
    memory, processor cores and other resources of a platform. Yet once
    VMs are created, our new Confidentiality and Integrity Protection
    (CIP) tables protect the memory of the guest VMs from accesses by the
    hypervisor or by DMA, depending on the customer's specification. If a
    hypervisor does become compromised, e.g. by an attack from a
    malicious VM, it cannot be used in turn to attack other VMs. The
    protections are enabled through minimal modifications to the
    microprocessor and memory management units. Whereas much of the
    previous work concentrates on protecting the hypervisor from attacks
    by guest VMs, we tackle the problem of protecting the guest VMs from
    the hypervisor.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Lee:2012:RSE,
  author          = {Min Lee and Karsten Schwan},
  title           = {Region scheduling: efficiently using the cache
    architectures via page-level affinity},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {40},
  number          = {1},
  pages           = {451--462},
  month           = mar,
  year            = {2012},
  DOI             = {https://doi.org/10.1145/2189750.2151023},
  bibdate         = {Fri Jun 1 17:06:46 MDT 2012},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note            = {ASPLOS '12 conference proceedings.},
  abstract        = {The performance of modern many-core platforms
    strongly depends on the effectiveness of using their complex cache
    and memory structures. This indicates the need for a memory-centric
    approach to platform scheduling, in which it is the locations of
    memory blocks in caches rather than CPU idleness that determines
    where application processes are run. Using the term `memory region'
    to denote the current set of physical memory pages actively used by
    an application, this paper presents and evaluates region-based
    scheduling methods for multicore platforms. This involves (i)
    continuously and at runtime identifying the memory regions used by
    executable entities, and their sizes, (ii) mapping these regions to
    caches to match performance goals, and (iii) maintaining region to
    cache mappings by ensuring that entities run on processors with
    direct access to the caches containing their regions. Region
    scheduling can implement policies that (i) offer improved performance
    to applications by `unifying' the multiple caches present on the
    underlying physical machine and/or by `balancing' cache usage to take
    maximum advantage of available cache space, (ii) better isolate
    applications from each other, particularly when their performance is
    strongly affected by cache availability, and also (iii) take
    advantage of standard scheduling and CPU-based load balancing when
    regioning is ineffective. The paper describes region scheduling and
    its system-level implementation and evaluates its performance with
    micro-benchmarks and representative multi-core applications. Single
    applications see performance improvements of up to 15\% with region
    scheduling, and we observe 40\% latency improvements when a platform
    is shared by multiple applications. Superior isolation is shown to be
    particularly important for cache-sensitive or real-time codes.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Juurlink:2012:ALP,
  author =       "B. H. H. Juurlink and C. H. Meenderinck",
  title =        "{Amdahl}'s law for predicting the future of multicores
                 considered harmful",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "2",
  pages =        "1--9",
  month =        may,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2234336.2234338",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 1 17:06:51 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Several recent works predict the future of multicore
                 systems or identify scalability bottlenecks based on
                 Amdahl's law. Amdahl's law implicitly assumes, however,
                 that the problem size stays constant, but in most cases
                 more cores are used to solve larger and more complex
                 problems. There is a related law known as Gustafson's
                 law which assumes that runtime, not the problem size,
                 is constant. In other words, it is assumed that the
                 runtime on p cores is the same as the runtime on 1 core
                 and that the parallel part of an application scales
                 linearly with the number of cores. We apply Gustafson's
                 law to symmetric, asymmetric, and dynamic multicores
                 and show that this leads to fundamentally different
                 results than when Amdahl's law is applied. We also
                 generalize Amdahl's and Gustafson's law and study how
                 this quantitatively affects the dimensioning of future
                 multicore systems.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Mueller:2012:ABA,
  author =       "Conrad Mueller",
  title =        "Axiom based architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "2",
  pages =        "10--17",
  month =        may,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2234336.2234339",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 1 17:06:51 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The paper proposes an axiom based architecture as an
                 alternative to the von Neumann model. The model has
                 many desirable properties: fine-grained parallelism,
                 simple semantics, better security and ease of
                 programming. The empirical research gives some
                 indication of its performance potential. A description
                 is given as to how algebraic arithmetic expressions of
                 relations can be broken up into primitive expressions
                 consisting of a single operation. These primitive
                 relations are shown to be sufficient to describe a
                 Turing machine. Eight inference rules are given that
                 define how the primitive relations can be evaluated. An
                 outline is given of an architecture based on these
                 inference rules. Finally a brief description is given
                 of an experimental emulation and empirical evaluation
                 of the architecture. Instead of manipulating data or
                 values by applying instructions or functions,
                 computation is applying existing elements to relations
                 to create new elements. The element's identifier
                 determines which relations the element applies to. The
                 relation determines the identifier of the new element
                 and the operation that needs to be applied to create
                 the value of the new element. Conceptually, indices
                 are different in this model. Instead of seeing an index
                 as an offset into an array, an index is seen as part of
                 the element identifier. This enables infinitely many
                 relations to be defined between unique sets using
                 universal quantifiers. Thus every element, or value,
                 computed has a unique description.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thomasian:2012:RPR,
  author =       "Alexander Thomasian",
  title =        "Rebuild processing in {RAID5} with emphasis on the
                 supplementary parity augmentation method",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "2",
  pages =        "18--27",
  month =        may,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2234336.2234340",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 1 17:06:51 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The rotated parity RAID5 disk array tolerates single
                 disk failures by continuing operation by on-demand
                 reconstruction of data blocks of the failed disk, until
                 the systematic reconstruction of the contents of the
                 failed disk is completed by the rebuild process on a
                 spare disk. Supplementary Parity Augmentation (SPA),
                 unlike the pyramid code, which has two parities
                 covering half of the array's disks each, extends RAID5's
                 P parity with an additional S parity, which covers half
                 of the disks. The extra load with respect to RAID5 of
                 updating the S parity by one half of the disks is
                 compensated by the more efficient on demand
                 reconstruction and rebuild processing when a disk
                 fails. Although SPA has the same disk space redundancy
                 level as RAID6, unlike RAID6 it can only deal with
                 roughly half of all possible double disk failure cases
                 for eight disks. For rebuild processing SPA reads half
                 of the disks required by RAID5 and this leads to a
                 higher Mean Time to Data Loss than RAID5, since fewer
                 Latent Sector Errors are encountered. We review
                 performance and reliability modeling of RAID5 arrays to
                 provide insights into SPA's performance and
                 reliability, which cannot be gained from numerical
                 results alone. SPA is outperformed by the Intra-Disk
                 Redundancy schemes combined with RAID5, which results
                 in RAID6's reliability and RAID5 performance.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Giri:2012:FIN,
  author =       "Nishant Kumar Giri and Amitabha Sinha",
  title =        "{FPGA} implementation of a novel architecture for
                 performance enhancement of Radix-2 {FFT}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "2",
  pages =        "28--32",
  month =        may,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2234336.2234341",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 1 17:06:51 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper presents a novel architecture for the
                 enhancement of performance of compute intensive Fast
                 Fourier Transform (FFT) algorithm which is common in
                 many signal processing applications. The proposed
                 architecture exhibits faster response time compared to
                 radix-2 `Single-path Delay Feedback (SDF)' architecture
                 and `radix-2 Multi-path Delay Commutator (MDC)'
                 architecture. The architecture was simulated using
                 Modelsim and was implemented on Xilinx Virtex 4 FPGA.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ghosh:2012:NAF,
  author =       "Aniruddha Ghosh and Satrughna Singha and Amitabha
                 Sinha",
  title =        "A new architecture for {FPGA} implementation of a
                 {MAC} unit for digital signal processors using mixed
                 number system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "2",
  pages =        "33--38",
  month =        may,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2234336.2234342",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 1 17:06:51 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Execution of arithmetic operations at very high speed
                 in real time is the major concern in digital signal
                 processing (DSP) because DSP algorithms are computation
                 intensive. In recent times, Residue Number Systems
                 (RNS) are considered as alternative to binary number
                 system because of their capabilities of performing
                 ``carry-free'' addition and multiplication. Double Base
                 Number Systems (DBNS), another non-binary number
                 systems are also increasingly becoming attractive for
                 signal processing applications due to their
                 capabilities of handling arithmetic operations,
                 particularly multiplication efficiently. However, the
                 complexity involved in converting binary to DBNS
                 becomes a major bottleneck and the efficiency of
                 performance decreases considerably due to large
                 conversion time. So RNS Adder and DBNS Multiplier can
                 be used to implement multiply \& accumulate (MAC)
                 units. Because RNS adders are less complex and faster
                 compared to DBNS and DBNS multipliers are efficient
                 compared to RNS multiplier. MAC units are the key units
                 in Digital Signal Processors. In this paper we have
                 shown how FIR filter can be implemented using the
                 proposed ``Mixed Number System MAC units''.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ghosh:2012:FPR,
  author =       "Aniruddha Ghosh and Satrughna Singha and Amitabha
                 Sinha",
  title =        "{``Floating point RNS''}: a new concept for designing
                 the {MAC} unit of digital signal processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "2",
  pages =        "39--43",
  month =        may,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2234336.2234343",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 1 17:06:51 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Execution of arithmetic operations at a very high
                 speed in real time is the major concern in compute
                 intensive digital signal processing (DSP) algorithms.
                 Residue Number Systems are being considered as
                 alternative to binary number system because of their
                 capabilities of performing ``carry free'' arithmetic
                 operations. However, RNS systems have so far been used
                 to handle integer numbers only. Floating Point RNS
                 arithmetic units have obvious advantages over fixed
                 point multiply \& accumulate (MAC) units which are
                 the key units in Digital Signal Processors. Keeping
                 this in view, in this paper, the architecture of a
                 floating point MAC unit is presented.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:2012:INa,
  author =       "Mark Thorson",
  title =        "{Internet} nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "2",
  pages =        "44--49",
  month =        may,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2234336.2234345",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Jun 1 17:06:51 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Liu:2012:RRA,
  author =       "Jamie Liu and Ben Jaiyen and Richard Veras and Onur
                 Mutlu",
  title =        "{RAIDR}: {Retention-Aware Intelligent DRAM Refresh}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "1--12",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337161",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Dynamic random-access memory (DRAM) is the building
                 block of modern main memory systems. DRAM cells must be
                 periodically refreshed to prevent loss of data. These
                 refresh operations waste energy and degrade system
                 performance by interfering with memory accesses. The
                 negative effects of DRAM refresh increase as DRAM
                 device capacity increases. Existing DRAM devices
                 refresh all cells at a rate determined by the leakiest
                 cell in the device. However, most DRAM cells can retain
                 data for significantly longer. Therefore, many of these
                 refreshes are unnecessary. In this paper, we propose
                 RAIDR (Retention-Aware Intelligent DRAM Refresh), a
                 low-cost mechanism that can identify and skip
                 unnecessary refreshes using knowledge of cell retention
                 times. Our key idea is to group DRAM rows into
                 retention time bins and apply a different refresh rate
                 to each bin. As a result, rows containing leaky cells
                 are refreshed as frequently as normal, while most rows
                 are refreshed less frequently. RAIDR uses Bloom filters
                 to efficiently implement retention time bins. RAIDR
                 requires no modification to DRAM and minimal
                 modification to the memory controller. In an 8-core
                 system with 32 GB DRAM, RAIDR achieves a 74.6\% refresh
                 reduction, an average DRAM power reduction of 16.1\%,
                 and an average system performance improvement of 8.6\%
                 over existing systems, at a modest storage overhead of
                 1.25 KB in the memory controller. RAIDR's benefits are
                 robust to variation in DRAM system configuration, and
                 increase as memory capacity increases.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bojnordi:2012:PPM,
  author =       "Mahdi Nazm Bojnordi and Engin Ipek",
  title =        "{PARDIS}: a programmable memory controller for the
                 {DDRx} interfacing standards",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "13--24",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337162",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Modern memory controllers employ sophisticated address
                 mapping, command scheduling, and power management
                 optimizations to alleviate the adverse effects of DRAM
                 timing and resource constraints on system performance.
                 A promising way of improving the versatility and
                 efficiency of these controllers is to make them
                 programmable---a proven technique that has seen wide
                 use in other control tasks ranging from DMA scheduling
                 to NAND Flash and directory control. Unfortunately, the
                 stringent latency and throughput requirements of modern
                 DDRx devices have rendered such programmability largely
                 impractical, confining DDRx controllers to
                 fixed-function hardware. This paper presents the
                 instruction set architecture (ISA) and hardware
                 implementation of PARDIS, a programmable memory
                 controller that can meet the performance requirements
                 of a high-speed DDRx interface. The proposed controller
                 is evaluated by mapping previously proposed DRAM
                 scheduling, address mapping, refresh scheduling, and
                 power management algorithms onto PARDIS. Simulation
                 results show that the average performance of PARDIS
                 comes within 8\% of fixed-function hardware for each of
                 these techniques; moreover, by enabling
                 application-specific optimizations, PARDIS improves
                 system performance by 6--17\% and reduces DRAM energy
                 by 9--22\% over four existing memory controllers.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Yoon:2012:BEM,
  author =       "Doe Hyun Yoon and Jichuan Chang and Naveen
                 Muralimanohar and Parthasarathy Ranganathan",
  title =        "{BOOM}: enabling mobile memory based low-power server
                 {DIMMs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "25--36",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337163",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "To address the real-time processing needs of large and
                 growing amounts of data, modern software increasingly
                 uses main memory as the primary data store for critical
                 information. This trend creates a new emphasis on
                 high-capacity, high-bandwidth, and high-reliability
                 main memory systems. Conventional and recently-proposed
                 server memory techniques can satisfy these
                 requirements, but at the cost of significantly
                 increased memory power, a key constraint for future
                 memory systems. In this paper, we exploit the low-power
                 nature of another high volume memory component---mobile
                 DRAM---while improving its bandwidth and reliability
                 shortcomings with a new DIMM architecture. We propose
                 Buffered Output On Module (BOOM) that buffers the data
                 outputs from multiple ranks of low-frequency mobile
                 DRAM devices, which in aggregation provide high
                 bandwidth and achieve chipkill-correct or even stronger
                 reliability. Our evaluation shows that BOOM can reduce
                 main memory power by more than 73\% relative to the
                 baseline chipkill system, while improving average
                 performance by 5\% and providing strong reliability.
                 For memory-intensive applications, BOOM can improve
                 performance by 30--40\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Malladi:2012:TEP,
  author =       "Krishna T. Malladi and Benjamin C. Lee and Frank A.
                 Nothaft and Christos Kozyrakis and Karthika
                 Periyathambi and Mark Horowitz",
  title =        "Towards energy-proportional datacenter memory with
                 mobile {DRAM}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "37--48",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337164",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "To increase datacenter energy efficiency, we need
                 memory systems that keep pace with processor efficiency
                 gains. Currently, servers use DDR3 memory, which is
                 designed for high bandwidth but not for energy
                 proportionality. A system using 20\% of the peak DDR3
                 bandwidth consumes 2.3x the energy per bit compared to
                 the energy consumed by a system with fully utilized
                 memory bandwidth. Nevertheless, many datacenter
                 applications stress memory capacity and latency but not
                 memory bandwidth. In response, we architect server
                 memory systems using mobile DRAM devices, trading peak
                 bandwidth for lower energy consumption per bit and more
                 efficient idle modes. We demonstrate 3--5x lower memory
                 power, better proportionality, and negligible
                 performance penalties for datacenter workloads.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Brunie:2012:SBW,
  author =       "Nicolas Brunie and Sylvain Collange and Gregory
                 Diamos",
  title =        "Simultaneous branch and warp interweaving for
                 sustained {GPU} performance",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "49--60",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337166",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Single-Instruction Multiple-Thread (SIMT)
                 micro-architectures implemented in Graphics Processing
                 Units (GPUs) run fine-grained threads in lockstep by
                 grouping them into units, referred to as warps, to
                 amortize the cost of instruction fetch, decode and
                 control logic over multiple execution units. As
                 individual threads take divergent execution paths,
                 their processing takes place sequentially, defeating
                 part of the efficiency advantage of SIMD execution. We
                 present two complementary techniques that mitigate the
                 impact of thread divergence on SIMT
                 micro-architectures. Both techniques relax the SIMD
                 execution model by allowing two distinct instructions
                 to be scheduled to disjoint subsets of the same row
                 of execution units, instead of one single instruction.
                 They increase flexibility by providing more thread
                 grouping opportunities than SIMD, while preserving the
                 affinity between threads to avoid introducing extra
                 memory divergence. We consider (1) co-issuing
                 instructions from different divergent paths of the same
                 warp and (2) co-issuing instructions from different
                 warps. To support (1), we introduce a novel thread
                 reconvergence technique that ensures threads are run
                 back in lockstep at control-flow reconvergence points
                 without hindering their ability to run branches in
                 parallel. We propose a lane shuffling technique to
                 allow solution (2) to benefit from inter-warp
                 correlations in divergence patterns. The combination of
                 all these techniques improves performance by 23\% on a
                 set of regular GPGPU applications and by 40\% on
                 irregular applications, while maintaining the same
                 instruction-fetch and processing-unit resource
                 requirements as the contemporary Fermi GPU
                 architecture.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Rhu:2012:CPC,
  author =       "Minsoo Rhu and Mattan Erez",
  title =        "{CAPRI}: prediction of compaction-adequacy for
                 handling control-divergence in {GPGPU} architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "61--71",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337167",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Wide SIMD-based GPUs have evolved into a promising
                 platform for running general purpose workloads. Current
                 programmable GPUs allow even code with irregular
                 control to execute well on their SIMD pipelines. To do
                 this, each SIMD lane is considered to execute a logical
                 thread where hardware ensures that control flow is
                 accurate by automatically applying masked execution.
                 The masked execution, however, often degrades
                 performance because the issue slots of masked lanes are
                 wasted. This degradation can be mitigated by
                 dynamically compacting multiple unmasked threads into a
                 single SIMD unit. This paper proposes a fundamentally
                 new approach to branch compaction that avoids the
                 unnecessary synchronization required by previous
                 techniques and that only stalls threads that are likely
                 to benefit from compaction. Our technique is based on
                 the compaction-adequacy predictor (CAPRI). CAPRI
                 dynamically identifies the compaction-effectiveness of
                 a branch and only stalls threads that are predicted to
                 benefit from compaction. We utilize a simple
                 single-level branch-predictor inspired structure and
                 show that this simple configuration attains a
                 prediction accuracy of 99.8\% and 86.6\% for
                 non-divergent and divergent workloads, respectively.
                 Our performance evaluation demonstrates that CAPRI
                 consistently outperforms both the baseline design that
                 never attempts compaction and prior work that stalls
                 upon all divergent branches.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Menon:2012:IES,
  author =       "Jaikrishnan Menon and Marc {De Kruijf} and Karthikeyan
                 Sankaralingam",
  title =        "{iGPU}: exception support and speculative execution on
                 {GPUs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "72--83",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337168",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Since the introduction of fully programmable vertex
                 shader hardware, GPU computing has made tremendous
                 advances. Exception support and speculative execution
                 are the next steps to expand the scope and improve the
                 usability of GPUs. However, traditional mechanisms to
                 support exceptions and speculative execution are highly
                 intrusive to GPU hardware design. This paper builds on
                 two related insights to provide a unified lightweight
                 mechanism for supporting exceptions and speculation on
                 GPUs. First, we observe that GPU programs can be broken
                 into code regions that contain little or no live
                 register state at their entry point. We then also
                 recognize that it is simple to generate these regions
                 in such a way that they are idempotent, allowing their
                 entry points to function as program recovery points and
                 enabling support for exception handling, fast context
                 switches, and speculation, all with very low overhead.
                 We call the architecture of GPUs executing these
                 idempotent regions the iGPU architecture. The hardware
                 extensions required are minimal and the construction of
                 idempotent code regions is fully transparent under the
                 typical dynamic compilation framework of GPUs. We
                 demonstrate how iGPU exception support enables virtual
                 memory paging with very low overhead (1\% to 4\%), and
                 how speculation support enables circuit-speculation
                 techniques that can provide over 25\% reduction in
                 energy.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Arnau:2012:BMG,
  author =       "Jos{\'e}-Mar{\'\i}a Arnau and Joan-Manuel Parcerisa
                 and Polychronis Xekalakis",
  title =        "Boosting mobile {GPU} performance with a decoupled
                 access\slash execute fragment processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "84--93",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337169",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Smartphones represent one of the fastest growing
                 markets, providing significant hardware/software
                 improvements every few months. However, supporting
                 these capabilities reduces the operating time per
                 battery charge. The CPU/GPU component is only left with
                 a shrinking fraction of the power budget, since most of
                 the energy is consumed by the screen and the antenna.
                 In this paper, we focus on improving the energy
                 efficiency of the GPU since graphical applications
                 constitute an important part of the existing market.
                 Moreover, the trend towards better screens will
                 inevitably lead to a higher demand for improved
                 graphics rendering. We show that the main bottleneck
                 for these applications is the texture cache and that
                 traditional techniques for hiding memory latency
                 (prefetching, multithreading) do not work well or come
                 at a high energy cost. We thus propose the migration of
                 GPU designs towards the decoupled access-execute
                 concept. Furthermore, we significantly reduce bandwidth
                 usage in the decoupled architecture by exploiting
                 inter-core data sharing. Using commercial Android
                 applications, we show that the end design can achieve
                 93\% of the performance of a heavily multithreaded GPU
                 while providing energy savings of 34\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kayaalp:2012:BRL,
  author =       "Mehmet Kayaalp and Meltem Ozsoy and Nael Abu-Ghazaleh
                 and Dmitry Ponomarev",
  title =        "Branch regulation: low-overhead protection from code
                 reuse attacks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "94--105",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337171",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Code reuse attacks (CRAs) are recent security exploits
                 that allow attackers to execute arbitrary code on a
                 compromised machine. CRAs, exemplified by
                 return-oriented and jump-oriented programming
                 approaches, reuse fragments of the library code, thus
                 avoiding the need for explicit injection of attack code
                 on the stack. Since the executed code is reused
                 existing code, CRAs bypass current hardware and
                 software security measures that prevent execution from
                 data or stack regions of memory. While software-based
                 full control flow integrity (CFI) checking can protect
                 against CRAs, it includes significant overhead,
                 involves non-trivial effort of constructing a control
                 flow graph, relies on proprietary tools and has
                 potential vulnerabilities due to the presence of
                 unintended branch instructions in architectures such as
                 x86---those branches are not checked by the software
                 CFI. We propose branch regulation (BR), a lightweight
                 hardware-supported protection mechanism against the
                 CRAs that addresses all limitations of software CFI. BR
                 enforces simple control flow rules in hardware at the
                 function granularity to disallow arbitrary control flow
                 transfers from one function into the middle of another
                 function. This prevents common classes of CRAs without
                 the complexity and run-time overhead of full CFI
                 enforcement. BR incurs a slowdown of about 2\% and
                 increases the code footprint by less than 1\% on the
                 average for the SPEC 2006 benchmarks.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Demme:2012:SCV,
  author =       "John Demme and Robert Martin and Adam Waksman and
                 Simha Sethumadhavan",
  title =        "Side-channel vulnerability factor: a metric for
                 measuring information leakage",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "106--117",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337172",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "There have been many attacks that exploit side-effects
                 of program execution to expose secret information and
                 many proposed countermeasures to protect against these
                 attacks. However, there is currently no systematic,
                 holistic methodology for understanding information
                 leakage. As a result, it is not well known how design
                 decisions affect information leakage or the
                 vulnerability of systems to side-channel attacks. In
                 this paper, we propose a metric for measuring
                 information leakage called the Side-channel
                 Vulnerability Factor (SVF). SVF is based on our
                 observation that all side-channel attacks ranging from
                 physical to microarchitectural to software rely on
                 recognizing leaked execution patterns. SVF quantifies
                 patterns in attackers' observations and measures their
                 correlation to the victim's actual execution patterns
                 and in doing so captures systems' vulnerability to
                 side-channel attacks. In a detailed case study of
                 on-chip memory systems, SVF measurements help expose
                 unexpected vulnerabilities in whole-system designs and
                 show how designers can make performance-security
                 trade-offs. Thus, SVF provides a quantitative approach
                 to secure computer architecture.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Martin:2012:TRT,
  author =       "Robert Martin and John Demme and Simha Sethumadhavan",
  title =        "{TimeWarp}: rethinking timekeeping and performance
                 monitoring mechanisms to mitigate side-channel
                 attacks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "118--129",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337173",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Over the past two decades, several microarchitectural
                 side channels have been exploited to create
                 sophisticated security attacks. Solutions to this
                 problem have mainly focused on fixing the source of
                 leaks either by limiting the flow of information
                 through the side channel by modifying hardware, or by
                 refactoring vulnerable software to protect sensitive
                 data from leaking. These solutions are reactive and not
                 preventative: while the modifications may protect
                 against a single attack, they do nothing to prevent
                 future side channel attacks that exploit other
                 microarchitectural side channels or exploit the same
                 side channel in a novel way. In this paper we present a
                 general mitigation strategy that focuses on the
                 infrastructure used to measure side channel leaks
                 rather than the source of leaks, and thus applies to
                 all known and unknown microarchitectural side channel
                 leaks. Our approach is to limit the fidelity of fine
                 grain timekeeping and performance counters, making it
                 difficult for an attacker to distinguish between
                 different microarchitectural events, thus thwarting
                 attacks. We demonstrate the strength of our proposed
                 security modifications, and validate that our changes
                 do not break existing software. Our proposed changes
                 require minor --- or in some cases, no --- hardware
                 modifications and do not result in any substantial
                 performance degradation, yet offer the most
                 comprehensive protection against microarchitectural
                 side channels to date.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Valamehr:2012:IRM,
  author =       "Jonathan Valamehr and Melissa Chase and Seny Kamara
                 and Andrew Putnam and Dan Shumow and Vinod
                 Vaikuntanathan and Timothy Sherwood",
  title =        "Inspection resistant memory: architectural support for
                 security from physical examination",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "130--141",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337174",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "The ability to safely keep a secret in memory is
                 central to the vast majority of security schemes, but
                 storing and erasing these secrets is a difficult
                 problem in the face of an attacker who can obtain
                 unrestricted physical access to the underlying
                 hardware. Depending on the memory technology, the very
                 act of storing a 1 instead of a 0 can have physical
                 side effects measurable even after the power has been
                 cut. These effects cannot be hidden easily, and if the
                 secret stored on chip is of sufficient value, an
                 attacker may go to extraordinary means to learn even a
                 few bits of that information. Solving this problem
                 requires a new class of architectures that measurably
                 increase the difficulty of physical analysis. In this
                 paper we take a first step towards this goal by
                 focusing on one of the backbones of any hardware
                 system: on-chip memory. We examine the relationship
                 between security, area, and efficiency in these
                 architectures, and quantitatively examine the resulting
                 systems through cryptographic analysis and
                 microarchitectural impact. In the end, we are able to
                 find an efficient scheme in which, even if an adversary
                 is able to inspect the value of a stored bit with a
                 probabilistic error of only 5\%, our system will be
                 able to prevent that adversary from learning any
                 information about the original un-coded bits with
                 99.9999999999\% probability.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Xu:2012:TPV,
  author =       "Yi Xu and Jun Yang and Rami Melhem",
  title =        "Tolerating process variations in nanophotonic on-chip
                 networks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "142--152",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337176",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Nanophotonic networks, a potential candidate for
                 future networks on-chip, have been challenged for their
                 reliability due to several device-level limitations.
                 One of the main issues is that fabrication errors
                 (a.k.a. process variations) can cause devices to
                 malfunction, rendering communication unreliable. For
                 example, a microring resonator, a preferred optical
                 modulator device, may not resonate at the designated
                 wavelength under process variations (PV), leading to
                 communication errors and bandwidth loss. This paper
                 proposes a series of solutions to the wavelength
                 drifting problem of microrings and subsequent bandwidth
                 loss problem of an optical network, due to PV. The
                 objective is to maximize network bandwidth through
                 proper arrangement among microrings and wavelengths
                 with minimum power requirement. Our arrangement, called
                 ``MinTrim'', solves this problem using simple integer
                 linear programming, adding supplementary microrings and
                 allowing flexible assignment of wavelengths to network
                 nodes as long as the resulting network presents maximal
                 bandwidth. Each step is shown to improve bandwidth
                 provisioning with lower power requirement. Evaluations
                 on a sample network show that a baseline network could
                 lose more than 40\% bandwidth due to PV. Such loss can
                 be recovered by MinTrim to produce a network with
                 98.4\% working bandwidth. In addition, the power
                 required in arranging microrings is 39\% lower than the
                 baseline. Therefore, MinTrim provides an efficient
                 PV-tolerant solution to improving the reliability of
                 on-chip photonics.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Koka:2012:MAA,
  author =       "Pranay Koka and Michael O. McCracken and Herb
                 Schwetman and Chia-Hsin Owen Chen and Xuezhe Zheng and
                 Ron Ho and Kannan Raj and Ashok V. Krishnamoorthy",
  title =        "A micro-architectural analysis of switched photonic
                 multi-chip interconnects",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "153--164",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337177",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Silicon photonics is a promising technology to scale
                 off-chip bandwidth in a power-efficient manner. Given
                 equivalent bandwidth, the flexibility of switched
                 networks often leads to the assumption that they
                 deliver greater performance than point-to-point
                 networks on message passing applications with low-radix
                 traffic patterns. However, when optical losses are
                 considered and total optical power is constrained, this
                 assumption no longer holds. In this paper we present a
                 power constrained method for designing photonic
                 interconnects that uses the power characteristics and
                 limits of optical switches, waveguide crossings,
                 inter-layer couplers and waveguides. We apply this
                 method to design three switched network topologies for
                 a multi-chip system. Using synthetic and HPC
                 benchmark-derived message patterns, we simulated the
                 three switched networks and a WDM point-to-point
                 network. We show that switched networks outperform
                 point-to-point networks only when the optical losses of
                 switches and inter-layer couplers are each 0.75
                 dB or lower; achieving this would require a major
                 breakthrough in device development. We then show that
                 this result extends to any switched network with
                 similarly complex topology, through simulations of an
                 idealized ``perfect'' network that supports 90\% of the
                 peak bandwidth under all traffic patterns. We conclude
                 that given a fixed amount of input optical power, under
                 realistic device assumptions, a point-to-point network
                 has the best performance and energy characteristics.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Carpenter:2012:EET,
  author =       "Aaron Carpenter and Jianyun Hu and Ovunc Kocabas and
                 Michael Huang and Hui Wu",
  title =        "Enhancing effective throughput for transmission
                 line-based bus",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "165--176",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337178",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Main-stream general-purpose microprocessors require a
                 collection of high-performance interconnects to supply
                 the necessary data movement. The trend of continued
                 increase in core count has prompted designs of
                 packet-switched network as a scalable solution for
                 future-generation chips. However, the cost of
                 scalability can be significant and especially hard to
                 justify for smaller-scale chips. In contrast, a
                 circuit-switched bus using transmission lines and
                 corresponding circuits offers lower latencies and much
                 lower energy costs for smaller-scale chips, making it a
                 better choice than a full-blown network-on-chip (NoC)
                 architecture. However, shared-medium designs are
                 perceived as only a niche solution for small- to
                 medium-scale chips. In this paper, we show that there
                 are many low-cost mechanisms to enhance the effective
                 throughput of a bus architecture. When a handful of
                 highly cost-effective techniques are applied, the
                 performance advantage of even the most idealistically
                 configured NoCs becomes vanishingly small. We find
                 transmission line-based buses to be a more compelling
                 interconnect even for large-scale chip-multiprocessors,
                 and thus bring into doubt the centrality of packet
                 switching in future on-chip interconnect.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Koibuchi:2012:CRS,
  author =       "Michihiro Koibuchi and Hiroki Matsutani and Hideharu
                 Amano and D. Frank Hsu and Henri Casanova",
  title =        "A case for random shortcut topologies for {HPC}
                 interconnects",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "177--188",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337179",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "As the scales of parallel applications and platforms
                 increase, the negative impact of communication latencies
                 on performance becomes large. Fortunately, modern High
                 Performance Computing (HPC) systems can exploit
                 low-latency topologies of high-radix switches. In this
                 context, we propose the use of random shortcut
                 topologies, which are generated by augmenting classical
                 topologies with random links. Using graph analysis we
                 find that these topologies, when compared to non-random
                 topologies of the same degree, lead to drastically
                 reduced diameter and average shortest path length. The
                 best results are obtained when adding random links to a
                 ring topology, meaning that good random shortcut
                 topologies can easily be generated for arbitrary
                 numbers of switches. Using flit-level discrete event
                 simulation we find that random shortcut topologies
                 achieve throughput comparable to and latency lower than
                 that of existing non-random topologies such as
                 hypercubes and tori. Finally, we discuss and quantify
                 practical challenges for random shortcut topologies,
                 including routing scalability and larger physical cable
                 lengths.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Nagarakatte:2012:WHS,
  author =       "Santosh Nagarakatte and Milo M. K. Martin and Steve
                 Zdancewic",
  title =        "{Watchdog}: hardware for safe and secure manual memory
                 management and full memory safety",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "189--200",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337181",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Languages such as C and C++ use unsafe manual memory
                 management, allowing simple bugs (i.e., accesses to an
                 object after deallocation) to become the root cause of
                 exploitable security vulnerabilities. This paper
                 proposes Watchdog, a hardware-based approach for
                 ensuring safe and secure manual memory management.
                 Inspired by prior software-only proposals, Watchdog
                 generates a unique identifier for each memory
                 allocation, associates these identifiers with pointers,
                 and checks to ensure that the identifier is still valid
                 on every memory access. This use of identifiers and
                 checks enables Watchdog to detect errors even in the
                 presence of reallocations. Watchdog stores these
                 pointer identifiers in a disjoint shadow space to
                 provide comprehensive protection and ensure
                 compatibility with existing code. To streamline the
                 implementation and reduce runtime overhead: Watchdog
                 (1) uses micro-ops to access metadata and perform
                 checks, (2) eliminates metadata copies among registers
                 via modified register renaming, and (3) uses a
                 dedicated metadata cache to reduce checking overhead.
                 Furthermore, this paper extends Watchdog's mechanisms
                 to detect bounds errors, thereby providing full
                 hardware-enforced memory safety at low overheads.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Devietti:2012:RAS,
  author =       "Joseph Devietti and Benjamin P. Wood and Karin Strauss
                 and Luis Ceze and Dan Grossman and Shaz Qadeer",
  title =        "{RADISH}: always-on sound and complete
                 {{\underline{Ra}ce \underline{D}etection \underline{i}n
                 \underline{S}oftware and \underline{H}ardware}}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "201--212",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337182",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Data-race freedom is a valuable safety property for
                 multithreaded programs that helps with catching bugs,
                 simplifying memory consistency model semantics, and
                 verifying and enforcing both atomicity and determinism.
                 Unfortunately, existing software-only dynamic race
                 detectors are precise but slow; proposals with hardware
                 support offer higher performance but are imprecise.
                 Both precision and performance are necessary to achieve
                 the many advantages always-on dynamic race detection
                 could provide. To resolve this trade-off, we propose
                 Radish, a hybrid hardware-software dynamic race
                 detector that is always-on and fully precise. In
                 Radish, hardware caches a principled subset of the
                 metadata necessary for race detection; this subset
                 allows the vast majority of race checks to occur
                 completely in hardware. A flexible software layer
                 handles persistence of race detection metadata on cache
                 evictions and occasional queries to this expanded set
                 of metadata. We show that Radish is correct by proving
                 equivalence to a conventional happens-before race
                 detector. Our design has modest hardware complexity:
                 caches are completely unmodified and we piggy-back on
                 existing coherence messages but do not otherwise modify
                 the protocol. Furthermore, Radish can leverage
                 type-safe languages to reduce overheads substantially.
                 Our evaluation of a simulated 8-core Radish processor
                 using PARSEC benchmarks shows runtime overheads from
                 negligible to 2x, outperforming the leading
                 software-only race detector by 2x--37x.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{VanCraeynest:2012:SHM,
  author =       "Kenzo {Van Craeynest} and Aamer Jaleel and Lieven
                 Eeckhout and Paolo Narvaez and Joel Emer",
  title =        "Scheduling heterogeneous multi-cores through
                 {Performance Impact Estimation (PIE)}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "213--224",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337184",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Single-ISA heterogeneous multi-core processors are
                 typically composed of small (e.g., in-order)
                 power-efficient cores and big (e.g., out-of-order)
                 high-performance cores. The effectiveness of
                 heterogeneous multi-cores depends on how well a
                 scheduler can map workloads onto the most appropriate
                 core type. In general, small cores can achieve good
                 performance if the workload inherently has high levels
                 of ILP. On the other hand, big cores provide good
                 performance if the workload exhibits high levels of MLP
                 or requires the ILP to be extracted dynamically. This
                 paper proposes Performance Impact Estimation (PIE) as a
                 mechanism to predict which workload-to-core mapping is
                 likely to provide the best performance. PIE collects
                 CPI stack, MLP and ILP profile information, and
                 estimates performance if the workload were to run on a
                 different core type. Dynamic PIE adjusts the scheduling
                 at runtime and thereby exploits fine-grained
                 time-varying execution behavior. We show that PIE
                 requires limited hardware support and can improve
                 system performance by an average of 5.5\% over recent
                 state-of-the-art scheduling proposals and by 8.7\% over
                 a sampling-based scheduling policy.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Cao:2012:YYP,
  author =       "Ting Cao and Stephen M. Blackburn and Tiejun Gao and
                 Kathryn S. McKinley",
  title =        "The yin and yang of power and performance for
                 asymmetric hardware and managed software",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "225--236",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337185",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "On the hardware side, asymmetric multicore processors
                 present software with the challenge and opportunity of
                 optimizing in two dimensions: performance and power.
                 Asymmetric multicore processors (AMP) combine
                 general-purpose big (fast, high power) cores and small
                 (slow, low power) cores to meet power constraints.
                 Realizing their energy efficiency opportunity requires
                 workloads with differentiated performance and power
                 characteristics. On the software side, managed
                 workloads written in languages such as C\#, Java,
                 JavaScript, and PHP are ubiquitous. Managed languages
                 abstract over hardware using Virtual Machine (VM)
                 services (garbage collection, interpretation, and/or
                 just-in-time compilation) that together impose
                 substantial energy and performance costs, ranging from
                 10\% to over 80\%. We show that these services manifest
                 a differentiated performance and power workload. To
                 differing degrees, they are parallel, asynchronous,
                 communicate infrequently, and are not on the
                 application's critical path. We identify a synergy
                 between AMP and VM services that we exploit to attack
                 the 40\% average energy overhead due to VM services.
                 Using measurements and very conservative models, we
                 show that adding small cores tailored for VM services
                 should deliver, at least, improvements in performance
                 of 13\%, energy of 7\%, and performance per energy of
                 22\%. The yin of VM services is overhead, but it meets
                 the yang of small cores on an AMP. The yin of AMP is
                 exposed hardware complexity, but it meets the yang of
                 abstraction in managed languages. VM services fulfill
                 the AMP requirement for an asynchronous, non-critical,
                 differentiated, parallel, and ubiquitous workload to
                 deliver energy efficiency. Generalizing this approach
                 beyond system software to applications will require
                 substantially more software and hardware investment,
                 but these results show the potential energy efficiency
                 gains are significant.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Krimer:2012:LDI,
  author =       "Evgeni Krimer and Patrick Chiang and Mattan Erez",
  title =        "Lane decoupling for improving the timing-error
                 resiliency of wide-{SIMD} architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "237--248",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337187",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "A significant portion of the energy dissipated in
                 modern integrated circuits is consumed by the overhead
                 associated with timing guardbands that ensure reliable
                 execution. Timing speculation, where the pipeline
                 operates at an unsafe voltage with any rare errors
                 detected and resolved by the architecture, has been
                 demonstrated to significantly improve the
                 energy-efficiency of scalar processor designs.
                 Unfortunately, applying the same timing-speculative
                 approach to wide-SIMD architectures, such as those used
                 in highly-efficient GPUs, may not provide similar
                 gains. In this work, we make two important
                 contributions. The first is a set of models describing
                 a parametrized general error probability function that
                 is based on measurements of a fabricated chip and the
                 expected efficiency benefits of timing speculation in a
                 SIMD context. The second contribution is a decoupled
                 SIMD pipeline that more effectively utilizes timing
                 speculation and recovery, when compared with a standard
                 SIMD design that uses only conventional timing
                 speculation. The proposed lane decoupling enables each
                 SIMD lane to tolerate timing errors independent of
                 other adjacent lanes, resulting in higher throughput
                 and improved scalability. We validate our models and
                 evaluate our design using a cycle-based GPU simulator,
                 describe the conditions where efficiency improvements
                 can be obtained, and explore the benefits of decoupling
                 across a wide range of parameters. Our results show
                 that timing speculation can achieve up to 10.3\%
                 improvement in efficiency.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Miller:2012:VCE,
  author =       "Timothy N. Miller and Renji Thomas and Xiang Pan and
                 Radu Teodorescu",
  title =        "{VRSync}: characterizing and eliminating
                 synchronization-induced voltage emergencies in
                 many-core processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "249--260",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337188",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Power consumption is a primary concern for
                 microprocessor designers. Lowering the supply voltage
                 of processors is one of the most effective techniques
                 for improving their energy efficiency. Unfortunately,
                 low-voltage operation faces multiple challenges going
                 forward. One such challenge is increased sensitivity to
                 voltage fluctuations, which can trigger so-called
                 ``voltage emergencies'' that can lead to errors. These
                 fluctuations are caused by abrupt changes in power
                 demand, triggered by processor activity variation as a
                 function of workload. This paper examines the effects
                 of voltage fluctuations on future many-core processors.
                 With the increase in the number of cores in a chip, the
                 effects of chip-wide activity fluctuation --- such as
                 that caused by global synchronization in multithreaded
                 applications --- overshadow the effects of core-level
                 workload variability. Starting from this observation,
                 we developed VRSync, a novel synchronization
                 methodology that uses emergency-aware scheduling
                 policies that reduce the slope of load fluctuations,
                 eliminating emergencies. We show that VRSync is very
                 effective at eliminating emergencies, allowing voltage
                 guardbands to be significantly lowered, which reduces
                 energy consumption by an average of 33\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Doudalis:2012:EFU,
  author =       "Ioannis Doudalis and Milos Prvulovic",
  title =        "{Euripus}: a flexible unified hardware memory
                 checkpointing accelerator for bidirectional-debugging
                 and reliability",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "261--272",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337190",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Bidirectional debugging and error recovery have
                 different goals (programmer productivity and system
                 reliability, respectively), yet they both require the
                 ability to roll-back the program or the system to a
                 past state. This rollback functionality is typically
                 implemented using checkpoints that can restore the
                 system/application to a specific point in time. There
                 are several types of checkpoints, and bidirectional
                 debugging and error-recovery use them in different
                 ways. This paper presents Euripus, a flexible
                 hardware accelerator for memory checkpointing which can
                 create different combinations of checkpoints needed for
                 bidirectional debugging, error recovery, or both. In
                 particular, Euripus is the first hardware technique to
                 provide consolidation-friendly undo-logs (for
                 bidirectional debugging), to allow simultaneous
                 construction of both undo and redo logs, and to support
                 multi-level checkpointing for the needs of
                 error-recovery. Euripus incurs low performance
                 overheads ({$<$5}\% on average), improves roll-back
                 latency for bidirectional debugging by {$>$30}\%, and
                 supports rapid multi-level error recovery that allows
                 {$>$95}\% system efficiency even with very high error
                 rates.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Nair:2012:FOM,
  author =       "Arun Arvind Nair and Stijn Eyerman and Lieven Eeckhout
                 and Lizy Kurian John",
  title =        "A first-order mechanistic model for architectural
                 vulnerability factor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "273--284",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337191",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Soft error reliability has become a first-order design
                 criterion for modern microprocessors. Architectural
                 Vulnerability Factor (AVF) modeling is often used to
                 capture the probability that a radiation-induced fault
                 in a hardware structure will manifest as an error at
                 the program output. AVF estimation requires detailed
                 microarchitectural simulations which are time-consuming
                 and typically present aggregate metrics. Moreover, it
                 requires a large number of simulations to derive
                 insight into the impact of microarchitectural events on
                 AVF. In this work we present a first-order mechanistic
                 analytical model for computing AVF by estimating the
                 occupancy of correct-path state in important
                 microarchitecture structures through inexpensive
                 profiling. We show that the model estimates the AVF for
                 the reorder buffer, issue queue, load and store queue,
                 and functional units in a 4-wide issue machine with a
                 mean absolute error of less than 0.07. The model is
                 constructed from the first principles of out-of-order
                 processor execution in order to provide novel insight
                 into the interaction of the workload with the
                 microarchitecture to determine AVF. We demonstrate that
                 the model can be used to perform design space
                 explorations to understand trade-offs between soft
                 error rate and performance, to study the impact of
                 scaling of microarchitectural structures on AVF and
                 performance, and to characterize workloads for AVF.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Udipi:2012:LEL,
  author =       "Aniruddha N. Udipi and Naveen Muralimanohar and Rajeev
                 Balasubramonian and Al Davis and Norman P. Jouppi",
  title =        "{LOT-ECC}: localized and tiered reliability mechanisms
                 for commodity memory systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "285--296",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337192",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Memory system reliability is a serious and growing
                 concern in modern servers. Existing chipkill-level
                 memory protection mechanisms suffer from several
                 drawbacks. They activate a large number of chips on
                 every memory access --- this increases energy
                 consumption, and reduces performance due to the
                 reduction in rank-level parallelism. Additionally, they
                 increase access granularity, resulting in wasted
                 bandwidth in the absence of sufficient access locality.
                 They also restrict systems to use narrow-I/O x4
                 devices, which are known to be less energy-efficient
                 than the wider x8 DRAM devices. In this paper, we
                 present LOT-ECC, a localized and multi-tiered
                 protection scheme that attempts to solve these
                 problems. We separate error detection and error
                 correction functionality, and employ simple checksum
                 and parity codes effectively to provide strong
                 fault-tolerance, while simultaneously simplifying
                 implementation. Data and codes are localized to the
                 same DRAM row to improve access efficiency. We use
                 system firmware to store correction codes in DRAM data
                 memory and modify the memory controller to handle data
                 mapping. We thus build an effective fault-tolerance
                 mechanism that provides strong reliability guarantees,
                 activates as few chips as possible (reducing power
                 consumption by up to 44.8\% and reducing latency by up
                 to 46.9\%), and reduces circuit complexity, all while
                 working with commodity DRAMs and operating systems.
                 Finally, we propose the novel concept of a
                 heterogeneous DIMM that enables the extension of
                 LOT-ECC to x16 and wider DRAM parts.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Basu:2012:RMR,
  author =       "Arkaprava Basu and Mark D. Hill and Michael M. Swift",
  title =        "Reducing memory reference energy with opportunistic
                 virtual caching",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "297--308",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337194",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Most modern cores perform a highly-associative
                 translation look aside buffer (TLB) lookup on every
                 memory access. These designs often hide the TLB lookup
                 latency by overlapping it with L1 cache access, but
                 this overlap does not hide the power dissipated by TLB
                 lookups. It can even exacerbate the power dissipation
                 by requiring higher associativity L1 cache. With
                 today's concern for power dissipation, designs could
                 instead adopt a virtual L1 cache, wherein TLB access
                 power is dissipated only after L1 cache misses.
                 Unfortunately, virtual caches have compatibility
                 issues, such as supporting writeable synonyms and x86's
                 physical page table walker. This work proposes an
                 Opportunistic Virtual Cache (OVC) that exposes virtual
                 caching as a dynamic optimization by allowing some
                 memory blocks to be cached with virtual addresses and
                 others with physical addresses. OVC relies on small OS
                 changes to signal which pages can use virtual caching
                 (e.g., no writeable synonyms), but defaults to physical
                 caching for compatibility. We show OVC's promise with
                 analysis that finds virtual cache problems exist, but
                 are dynamically rare. We change 240 lines in Linux
                 2.6.28 to enable OVC. On experiments with Parsec and
                 commercial workloads, the resulting system saves
                 94--99\% of TLB lookup energy and nearly 23\% of L1
                 cache dynamic lookup energy.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wang:2012:IWE,
  author =       "Zhe Wang and Samira M. Khan and Daniel A.
                 Jim{\'e}nez",
  title =        "Improving writeback efficiency with decoupled
                 last-write prediction",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "309--320",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337195",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "In modern DDRx memory systems, memory write requests
                 compete with read requests for available memory
                 resources, significantly increasing the average read
                 request service time. Caches are used to mitigate long
                 memory read latency that limits system performance.
                 Dirty blocks in the last-level cache (LLC) that will
                 not be written again before they are evicted will
                 eventually be written back to memory. We refer to these
                 blocks as last-write blocks. In this paper, we propose
                 an LLC writeback technique that improves DRAM
                 efficiency by scheduling predicted last-write blocks
                 early. We propose a low overhead last-write predictor
                 for the LLC. The predicted last-write blocks are made
                 available to the memory controller for scheduling. This
                 technique effectively re-distributes the memory
                 requests and expands writes scheduling opportunities,
                 allowing writes to be serviced efficiently by DRAM. The
                 technique is flexible enough to be applied to any LLC
                 replacement policy. Our evaluation with
                 multi-programmed workloads shows that the technique
                 significantly improves performance by 6.5\%--11.4\% on
                 average over the traditional writeback technique in an
                 eight-core processor with various DRAM configurations
                 running memory intensive benchmarks.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sim:2012:FBC,
  author =       "Jaewoong Sim and Jaekyu Lee and Moinuddin K. Qureshi
                 and Hyesoon Kim",
  title =        "{FLEXclusion}: balancing cache capacity and on-chip
                 bandwidth via flexible exclusion",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "321--332",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337196",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Exclusive last-level caches (LLCs) reduce memory
                 accesses by effectively utilizing cache capacity.
                 However, they require excessive on-chip bandwidth to
                 support frequent insertions of cache lines on eviction
                 from upper-level caches. Non-inclusive caches, on the
                 other hand, have the advantage of using the on-chip
                 bandwidth more effectively but suffer from a higher
                 miss rate. Traditionally, the decision to use the cache
                 as exclusive or non-inclusive is made at design time.
                 However, the best option for a cache organization
                 depends on application characteristics, such as working
                 set size and the amount of traffic consumed by LLC
                 insertions. This paper proposes FLEXclusion, a design
                 that dynamically selects between exclusion and
                 non-inclusion depending on workload behavior. With
                 FLEXclusion, the cache behaves like an exclusive cache
                 when the application benefits from extra cache
                 capacity, and it acts as a non-inclusive cache when
                 additional cache capacity is not useful, so that it can
                 reduce on-chip bandwidth. FLEXclusion leverages the
                 observation that both non-inclusion and exclusion rely
                 on similar hardware support, so our proposal can be
                 implemented with negligible hardware changes. Our
                 evaluations show that a FLEXclusive cache reduces the
                 on-chip LLC insertion traffic by 72.6\% compared to an
                 exclusive design and improves performance by 5.9\%
                 compared to a non-inclusive design.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Upasani:2012:SED,
  author =       "Gaurang Upasani and Xavier Vera and Antonio
                 Gonz{\'a}lez",
  title =        "Setting an error detection infrastructure with low
                 cost acoustic wave detectors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "333--343",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337198",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "The continuing decrease in dimensions and operating
                 voltage of transistors has increased their sensitivity
                 against radiation phenomena making soft errors an
                 important challenge in future chip multiprocessors
                 (CMPs). Hence, new techniques for detecting errors in
                 the logic and memories that allow meeting the desired
                 failures-in-time (FIT) budget in CMPs are required.
                 This paper proposes a low-cost dynamic particle strike
                 detection mechanism through acoustic wave detectors.
                 Our results show that our mechanism can protect both
                 the logic and the memory arrays. As a case study, we
                 also show how this technique can be combined with error
                 codes to protect the last-level cache at low cost.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Pellegrini:2012:VVP,
  author =       "Andrea Pellegrini and Joseph L. Greathouse and Valeria
                 Bertacco",
  title =        "{Viper}: virtual pipelines for enhanced reliability",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "344--355",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337199",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "The reliability of future processors is threatened by
                 decreasing transistor robustness. Current architectures
                 focus on delivering high performance at low cost;
                 lifetime device reliability is a secondary concern. As
                 the rate of permanent hardware faults increases,
                 robustness will become a first class constraint for
                 even low-cost systems. Current research into reliable
                 architectures has focused on ad-hoc solutions to
                 improve designs without altering their centralized
                 control logic. Unfortunately, this centralized control
                 presents a single point of failure, which limits
                 long-term robustness. To address this issue, we
                 introduce Viper, an architecture built from a redundant
                 collection of fine-grained hardware components.
                 Instructions are perceived as customers that require a
                 sequence of services in order to properly execute. The
                 hardware components vie to perform what services they
                 can, dynamically forming virtual pipelines that avoid
                 defective hardware. This is done using distributed
                 control logic, which avoids a single point of failure
                 by construction. Viper can tolerate a high number of
                 permanent faults due to its inherent redundancy. As
                 fault counts increase, its performance degrades more
                 gracefully than traditional centralized-logic
                 architectures. We estimate that fault rates higher than
                 one permanent fault per 12 million transistors, on
                 average, cause the throughput of a classic CMP design
                 to fall below that of a Viper design of similar size.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Temam:2012:DTA,
  author =       "Olivier Temam",
  title =        "A defect-tolerant accelerator for emerging
                 high-performance applications",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "356--367",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337200",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Due to the evolution of technology constraints,
                 especially energy constraints which may lead to
                 heterogeneous multi-cores, and the increasing number of
                 defects, the design of defect-tolerant accelerators for
                 heterogeneous multi-cores may become a major
                 micro-architecture research issue. Most custom circuits
                 are highly defect sensitive, a single transistor can
                 wreck such circuits. On the contrary, artificial neural
                 networks (ANNs) are inherently error tolerant
                 algorithms. And the emergence of high-performance
                 applications implementing recognition and mining tasks,
                 for which competitive ANN-based algorithms exist,
                 drastically expands the potential application scope of
                 a hardware ANN accelerator. However, while the error
                 tolerance of ANN algorithms is well documented, there
                 are few in-depth attempts at demonstrating that an
                 actual hardware ANN would be tolerant to faulty
                 transistors. Most fault models are abstract and cannot
                 demonstrate that the error tolerance of ANN algorithms
                 can be translated into the defect tolerance of hardware
                 ANN accelerators. In this article, we introduce a
                 hardware ANN geared towards defect tolerance and energy
                 efficiency, by spatially expanding the ANN. In order to
                 precisely assess the defect tolerance capability of
                 this hardware ANN, we introduce defects at the level of
                 transistors, and then assess the impact of such defects
                 on the hardware ANN functional behavior. We empirically
                 show that the conceptual error tolerance of neural
                 networks does translate into the defect tolerance of
                 hardware neural networks, paving the way for their
                 introduction in heterogeneous multi-cores as
                 intrinsically defect-tolerant and energy-efficient
                 accelerators.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kim:2012:CES,
  author =       "Yoongu Kim and Vivek Seshadri and Donghyuk Lee and
                 Jamie Liu and Onur Mutlu",
  title =        "A case for exploiting subarray-level parallelism
                 {(SALP)} in {DRAM}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "368--379",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337202",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Modern DRAMs have multiple banks to serve multiple
                 memory requests in parallel. However, when two requests
                 go to the same bank, they have to be served serially,
                 exacerbating the high latency of off-chip memory.
                 Adding more banks to the system to mitigate this
                 problem incurs high system cost. Our goal in this work
                 is to achieve the benefits of increasing the number of
                 banks with a low cost approach. To this end, we propose
                 three new mechanisms that overlap the latencies of
                 different requests that go to the same bank. The key
                 observation exploited by our mechanisms is that a
                 modern DRAM bank is implemented as a collection of
                 subarrays that operate largely independently while
                 sharing few global peripheral structures. Our proposed
                 mechanisms (SALP-1, SALP-2, and MASA) mitigate the
                 negative impact of bank serialization by overlapping
                 different components of the bank access latencies of
                 multiple requests that go to different subarrays within
                 the same bank. SALP-1 requires no changes to the
                 existing DRAM structure and only needs reinterpretation
                 of some DRAM timing parameters. SALP-2 and MASA require
                 only modest changes ({$<$} 0.15\% area overhead) to the
                 DRAM peripheral structures, which are much less design
                 constrained than the DRAM core. Evaluations show that
                 all our schemes significantly improve performance for
                 both single-core systems and multi-core systems. Our
                 schemes also interact positively with application-aware
                 memory request scheduling in multi-core systems.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Qureshi:2012:PIP,
  author =       "Moinuddin K. Qureshi and Michele M. Franceschini and
                 Ashish Jagmohan and Luis A. Lastras",
  title =        "{PreSET}: improving performance of phase change
                 memories by exploiting asymmetry in write times",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "380--391",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337203",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Phase Change Memory (PCM) is a promising technology
                 for building future main memory systems. A prominent
                 characteristic of PCM is that it has write latency much
                 higher than read latency. Servicing such slow writes
                 causes significant contention for read requests. For
                 our baseline PCM system, the slow writes increase the
                 effective read latency by almost 2X, causing
                 significant performance degradation. This paper
                 alleviates the problem of slow writes by exploiting the
                 fundamental property of PCM devices that writes are
                 slow only in one direction (SET operation) and are
                 almost as fast as reads in the other direction (RESET
                 operation). Therefore, a write operation to a line in
                 which all memory cells have been SET prior to the
                 write, will incur much lower latency. We propose
                 PreSET, an architectural technique that leverages this
                 property to pro-actively SET all the bits in a given
                 memory line well in advance of the anticipated write to
                 that memory line. Our proposed design initiates a
                 PreSET request for a memory line as soon as that line
                 becomes dirty in the cache, thereby allowing a large
                 window of time for the PreSET operation to complete.
                 Our evaluations show that PreSET is more effective and
                 incurs lower storage overhead than previously proposed
                 write cancellation techniques. We also describe static
                 and dynamic throttling schemes to limit the rate of
                 PreSET operations. Our proposal reduces effective read
                 latency from 982 cycles to 594 cycles and increases
                 system performance by 34\%, while improving the
                 energy-delay-product by 25\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Cooper-Balis:2012:BBM,
  author =       "Elliott Cooper-Balis and Paul Rosenfeld and Bruce
                 Jacob",
  title =        "Buffer-on-board memory systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "392--403",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337204",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "The design and implementation of the commodity memory
                 architecture has resulted in significant performance
                 and capacity limitations. To circumvent these
                 limitations, designers and vendors have begun to place
                 intermediate logic between the CPU and DRAM. This
                 additional logic has two functions: to control the DRAM
                 and to communicate with the CPU over a fast and narrow
                 bus. The benefit provided by this logic is a reduction
                 in pin-out to the memory system and increased signal
                 integrity to the DRAM, allowing faster clock rates
                 while maintaining capacity. While the few vendors
                 utilizing this design have used the same general
                 approach, their implementations vary greatly in their
                 nontrivial details. A hardware-verified simulation
                 suite is developed to accurately model and evaluate the
                 behavior of this buffer-on-board memory system. A study
                 of this design space is used to determine optimal use
                 of the resources involved. This includes DRAM and bus
                 organization, queue storage, and mapping schemes.
                 Various constraints based on implementation costs are
                 placed on simulated configurations to confirm that
                 these optimizations apply to viable systems. Finally,
                 full system simulations are performed to better
                 understand how this memory system interacts with an
                 operating system executing an application with the goal
                 of uncovering behaviors not present in simple limit
                 case simulations. When applying insights gleaned from
                 these simulations, optimal performance can be achieved
                 while still considering outside constraints (i.e.,
                 pin-out, power, and fabrication costs).",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Jung:2012:PAQ,
  author =       "Myoungsoo Jung and Ellis H. {Wilson III} and Mahmut
                 Kandemir",
  title =        "{Physically Addressed Queueing (PAQ)}: improving
                 parallelism in solid state disks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "404--415",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337206",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "NAND flash storage has proven to be a competitive
                 alternative to traditional disk for its properties of
                 high random-access speeds, low-power and its presumed
                 efficacy for random-reads. Ironically, we demonstrate
                 that when packaged in SSD format, there arise many
                 barriers to reaching full parallelism in reads,
                 resulting in random writes out-performing them.
                 Motivated by this, we propose Physically Addressed
                 Queuing (PAQ), a request scheduler that avoids resource
                 contention resultant from shared SSD resources. PAQ
                 makes the following major contributions: First, it
                 exposes the physical addresses of requests to the
                 scheduler. Second, I/O clumping is utilized to select
                 groups of operations that can be simultaneously
                 executed without major resource conflict. Third,
                 inter-request NAND transaction packing empowers
                 multi-plane-mode operations. We implement PAQ in a
                 cycle-accurate simulator and demonstrate bandwidth and
                 IOPS improvements greater than 62\% and latency
                 decreases as much as 41.6\% for random reads, without
                 degrading performance of other access types.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ausavarungnirun:2012:SMS,
  author =       "Rachata Ausavarungnirun and Kevin Kai-Wei Chang and
                 Lavanya Subramanian and Gabriel H. Loh and Onur Mutlu",
  title =        "Staged memory scheduling: achieving high performance
                 and scalability in heterogeneous systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "416--427",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337207",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "When multiple processor (CPU) cores and a GPU
                 integrated together on the same chip share the off-chip
                 main memory, requests from the GPU can heavily
                 interfere with requests from the CPU cores, leading to
                 low system performance and starvation of CPU cores.
                 Unfortunately, state-of-the-art application-aware
                 memory scheduling algorithms are ineffective at solving
                 this problem at low complexity due to the large amount
                 of GPU traffic. A large and costly request buffer is
                 needed to provide these algorithms with enough
                 visibility across the global request stream, requiring
                 relatively complex hardware implementations. This paper
                 proposes a fundamentally new approach that decouples
                 the memory controller's three primary tasks into three
                 significantly simpler structures that together improve
                 system performance and fairness, especially in
                 integrated CPU-GPU systems. Our three-stage memory
                 controller first groups requests based on row-buffer
                 locality. This grouping allows the second stage to
                 focus only on inter-application request scheduling.
                 These two stages enforce high-level policies regarding
                 performance and fairness, and therefore the last stage
                 consists of simple per-bank FIFO queues (no further
                 command reordering within each bank) and
                 straightforward logic that deals only with low-level
                 DRAM commands and timing. We evaluate the design
                 trade-offs involved in our Staged Memory Scheduler
                 (SMS) and compare it against three state-of-the-art
                 memory controller designs. Our evaluations show that
                 SMS improves CPU performance without degrading GPU
                 frame rate beyond a generally acceptable level, while
                 being significantly less complex to implement than
                 previous application-aware schedulers. Furthermore, SMS
                 can be configured by the system software to prioritize
                 the CPU or the GPU at varying levels to address
                 different performance needs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Manikantan:2012:PSC,
  author =       "R. Manikantan and Kaushik Rajan and R. Govindarajan",
  title =        "{Probabilistic Shared Cache Management (PriSM)}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "428--439",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337208",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Effective sharing of the last level cache has a
                 significant influence on the overall performance of a
                 multicore system. We observe that existing solutions
                 control cache occupancy at a coarser granularity, do
                 not scale well to large core counts and in some cases
                 lack the flexibility to support a variety of
                 performance goals. In this paper, we propose
                 Probabilistic Shared Cache Management (PriSM), a
                 framework to manage the cache occupancy of different
                 cores at cache block granularity by controlling their
                 eviction probabilities. The proposed framework requires
                 only simple hardware changes to implement, can scale to
                 larger core count and is flexible enough to support a
                 variety of performance goals. We demonstrate the
                 flexibility of PriSM, by computing the eviction
                 probabilities needed to achieve goals like
                 hit-maximization, fairness and QOS. PriSM-HitMax
                 improves performance by 18.7\% over LRU and 11.8\% over
                 previously proposed schemes in a sixteen core machine.
                 PriSM-Fairness improves fairness over existing
                 solutions by 23.3\% along with a performance
                 improvement of 19.0\%. PriSM-QOS successfully achieves
                 the desired QOS targets.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Satish:2012:CTP,
  author =       "Nadathur Satish and Changkyu Kim and Jatin Chhugani
                 and Hideki Saito and Rakesh Krishnaiyer and Mikhail
                 Smelyanskiy and Milind Girkar and Pradeep Dubey",
  title =        "Can traditional programming bridge the {Ninja}
                 performance gap for parallel computing applications?",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "440--451",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337210",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Current processor trends of integrating more cores
                 with wider SIMD units, along with a deeper and complex
                 memory hierarchy, have made it increasingly more
                 challenging to extract performance from applications.
                 It is believed by some that traditional approaches to
                 programming do not apply to these modern processors and
                 hence radical new languages must be discovered. In this
                 paper, we question this thinking and offer evidence in
                 support of traditional programming methods and the
                 performance-vs-programming effort effectiveness of
                 common multi-core processors and upcoming many-core
                 architectures in delivering significant speedup, and
                 close-to-optimal performance for commonly used parallel
                 computing workloads. We first quantify the extent of
                 the ``Ninja gap'', which is the performance gap between
                 naively written C/C++ code that is parallelism unaware
                 (often serial) and best-optimized code on modern
                 multi-/many-core processors. Using a set of
                 representative throughput computing benchmarks, we show
                 that there is an average Ninja gap of 24X (up to 53X)
                 for a recent 6-core Intel\reg{} Core\texttrademark{} i7 X980 Westmere
                 CPU, and that this gap if left unaddressed will
                 inevitably increase. We show how a set of well-known
                 algorithmic changes coupled with advancements in modern
                 compiler technology can bring down the Ninja gap to an
                 average of just 1.3X. These changes typically require
                 low programming effort, as compared to the very high
                 effort in producing Ninja code. We also discuss
                 hardware support for programmability that can reduce
                 the impact of these changes and even further increase
                 programmer productivity. We show equally encouraging
                 results for the upcoming Intel\reg{} Many Integrated
                 Core architecture (Intel\reg{} MIC) which has more
                 cores and wider SIMD. We thus demonstrate that we can
                 contain the otherwise uncontrolled growth of the Ninja
                 gap and offer a more stable and predictable performance
                 growth over future architectures, offering strong
                 evidence that radical language changes are not
                 required.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kambadur:2012:HCA,
  author =       "Melanie Kambadur and Kui Tang and Martha A. Kim",
  title =        "{Harmony}: collection and analysis of parallel block
                 vectors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "452--463",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337211",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Efficient execution of well-parallelized applications
                 is central to performance in the multicore era. Program
                 analysis tools support the hardware and software sides
                 of this effort by exposing relevant features of
                 multithreaded applications. This paper describes
                 parallel block vectors, which uncover previously unseen
                 characteristics of parallel programs. Parallel block
                 vectors provide block execution profiles per
                 concurrency phase (e.g., the block execution profile of
                 all serial regions of a program). This information
                 provides a direct and fine-grained mapping between an
                 application's runtime parallel phases and the static
                 code that makes up those phases. This paper also
                 demonstrates how to collect parallel block vectors with
                 minimal application perturbation using Harmony. Harmony
                 is an instrumentation pass for the LLVM compiler that
                 introduces just 16--21\% overhead on average across
                 eight Parsec benchmarks. We apply parallel block
                 vectors to uncover several novel insights about
                 parallel applications with direct consequences for
                 architectural design. First, that the serial and
                 parallel phases of execution used in Amdahl's Law are
                 often composed of many of the same basic blocks.
                 Second, that program features, such as instruction mix,
                 vary based on the degree of parallelism, with serial
                 phases in particular displaying different instruction
                 mixes from the program as a whole. Third, that dynamic
                 execution frequencies do not necessarily correlate with
                 a block's parallelism.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wentzlaff:2012:CFG,
  author =       "David Wentzlaff and Christopher J. Jackson and Patrick
                 Griffin and Anant Agarwal",
  title =        "Configurable fine-grain protection for multicore
                 processor virtualization",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "464--475",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337213",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Multicore architectures, with their abundant on-chip
                 resources, are effectively collections of
                 systems-on-a-chip. The protection system for these
                 architectures must support multiple concurrently
                 executing operating systems (OSes) with different
                 needs, and manage and protect the hardware's novel
                 communication mechanisms and hardware features.
                 Traditional protection systems are insufficient; they
                 protect supervisor from user code, but typically do not
                 protect one system from another, and only support fixed
                 assignment of resources to protection levels. In this
                 paper, we propose an alternative to traditional
                 protection systems which we call configurable
                 fine-grain protection (CFP). CFP enables the dynamic
                 assignment of in-core resources to protection levels.
                 We investigate how CFP enables different system
                 software stacks to utilize the same configurable
                 protection hardware, and how differing OSes can execute
                 at the same time on a multicore processor with CFP. As
                 illustration, we describe an implementation of CFP in a
                 commercial multicore, the TILE64 processor.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ahn:2012:RHA,
  author =       "Jeongseob Ahn and Seongwook Jin and Jaehyuk Huh",
  title =        "Revisiting hardware-assisted page walks for
                 virtualized systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "476--487",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337214",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Recent improvements in architectural supports for
                 virtualization have extended traditional hardware page
                 walkers to traverse nested page tables. However,
                 current two-dimensional (2D) page walkers have been
                 designed under the assumption that the usage patterns
                 of guest and nested page tables are similar. In this
                 paper, we revisit the architectural supports for nested
                 page table walks to incorporate the unique
                 characteristics of memory management by hypervisors.
                 Unlike page tables in native systems, nested page table
                 sizes do not impose significant overheads on the
                 overall memory usage. Based on this observation, we
                 propose to use flat nested page tables to reduce
                 unnecessary memory references for nested walks. A
                 competing mechanism to HW 2D page walkers is shadow
                 paging, which duplicates guest page tables but provides
                 direct translations from guest virtual to system
                 physical addresses. However, shadow paging has been
                 suffering from the overheads of synchronization between
                 guest and shadow page tables. The second mechanism we
                 propose is a speculative shadow paging mechanism,
                 called speculative inverted shadow paging, which is
                 backed by non-speculative flat nested page tables. The
                 speculative mechanism provides a direct translation
                 with a single memory reference for common cases, and
                 eliminates the page table synchronization overheads. We
                 evaluate the proposed schemes with the real Xen
                 hypervisor running on a full system simulator. The flat
                 page tables improve a state-of-the-art 2D page walker
                 with a page walk cache and nested TLB by 7\%. The
                 speculative shadow paging improves the same 2D page
                 walker by 14\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kontorinis:2012:MDU,
  author =       "Vasileios Kontorinis and Liuyi Eric Zhang and Baris
                 Aksanli and Jack Sampson and Houman Homayoun and Eddie
                 Pettis and Dean M. Tullsen and Tajana Simunic Rosing",
  title =        "Managing distributed {UPS} energy for effective power
                 capping in data centers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "488--499",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337216",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Power over-subscription can reduce costs for modern
                 data centers. However, designing the power
                 infrastructure for a lower operating power point than
                 the aggregated peak power of all servers requires
                 dynamic techniques to avoid high peak power costs and,
                 even worse, tripping circuit breakers. This work
                 presents an architecture for distributed per-server
                 UPSs that stores energy during low activity periods and
                 uses this energy during power spikes. This work
                 leverages the distributed nature of the UPS batteries
                 and develops policies that prolong the duration of
                 their usage. The specific approach shaves 19.4\% of the
                 peak power for modern servers, at no cost in
                 performance, allowing the installation of 24\% more
                 servers within the same power budget. More servers
                 amortize infrastructure costs better and, hence, reduce
                 total cost of ownership per server by 6.3\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lotfi-Kamran:2012:SP,
  author =       "Pejman Lotfi-Kamran and Boris Grot and Michael Ferdman
                 and Stavros Volos and Onur Kocberber and Javier Picorel
                 and Almutaz Adileh and Djordje Jevdjic and Sachin
                 Idgunji and Emre Ozer and Babak Falsafi",
  title =        "Scale-out processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "500--511",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337217",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Scale-out datacenters mandate high per-server
                 throughput to get the maximum benefit from the large
                 TCO investment. Emerging applications (e.g., data
                 serving and web search) that run in these datacenters
                 operate on vast datasets that are not accommodated by
                 on-die caches of existing server chips. Large caches
                 reduce the die area available for cores and lower
                 performance through long access latency when
                 instructions are fetched. Performance on scale-out
                 workloads is maximized through a modestly-sized
                 last-level cache that captures the instruction
                 footprint at the lowest possible access latency. In
                 this work, we introduce a methodology for designing
                 scalable and efficient scale-out server processors.
                 Based on a metric of performance-density, we facilitate
                 the design of optimal multi-core configurations, called
                 pods. Each pod is a complete server that tightly
                 couples a number of cores to a small last-level cache
                 using a fast interconnect. Replicating the pod to fill
                 the die area yields processors which have optimal
                 performance density, leading to maximum per-chip
                 throughput. Moreover, as each pod is a stand-alone
                 server, scale-out processors avoid the expense of
                 global (i.e., inter-pod) interconnect and coherence.
                 These features synergistically maximize throughput,
                 lower design complexity, and improve technology
                 scalability. In 20nm technology, scale-out chips
                 improve throughput by 5x-6.5x over conventional and by
                 1.6x-1.9x over emerging tiled organizations.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Li:2012:ICO,
  author =       "Chao Li and Amer Qouneh and Tao Li",
  title =        "{iSwitch}: coordinating and optimizing renewable
                 energy powered server clusters",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "512--523",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337218",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Large-scale computing systems such as data centers are
                 facing increasing pressure to cap their carbon
                 footprint. Integrating emerging clean energy solutions
                 into computer system design therefore gains great
                 significance in the green computing era. While some
                 pioneering work on tracking variable power budget show
                 promising energy efficiency, they are not suitable for
                 data centers due to lack of performance guarantee when
                 renewable generation is low and fluctuant. In addition,
                 our characterization of wind power behavior reveals
                 that data centers designed to track the intermittent
                 renewable power incur up to 4X performance loss due to
                 inefficient and redundant load matching activities. As
                 a result, mitigating operational overhead while still
                 maintaining desired energy utilization becomes the most
                 significant challenge in managing server clusters on
                 intermittent renewable energy generation. In this paper
                 we take a first step in digging into the operational
                 overhead of renewable energy powered data center. We
                 propose iSwitch, a lightweight server power management
                 that follows renewable power variation characteristics,
                 leverages existing system infrastructures, and applies
                 supply/load cooperative scheme to mitigate the
                 performance overhead. Comparing with state-of-the-art
                 renewable energy driven system design, iSwitch could
                 mitigate average network traffic by 75\%, peak network
                 traffic by 95\%, and reduce 80\% job waiting time while
                 still maintaining 96\% renewable energy utilization. We
                 expect that our work can help computer architects make
                 informed decisions on sustainable and high-performance
                 system design.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Singh:2012:EES,
  author =       "Abhayendra Singh and Satish Narayanasamy and Daniel
                 Marino and Todd Millstein and Madanlal Musuvathi",
  title =        "End-to-end sequential consistency",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "524--535",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337220",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Sequential consistency (SC) is arguably the most
                 intuitive behavior for a shared-memory multithreaded
                 program. It is widely accepted that language-level SC
                 could significantly improve programmability of a
                 multiprocessor system. However, efficiently supporting
                 end-to-end SC remains a challenge as it requires that
                 both compiler and hardware optimizations preserve SC
                 semantics. While a recent study has shown that a
                 compiler can preserve SC semantics for a small
                 performance cost, an efficient and complexity-effective
                 SC hardware remains elusive. Past hardware solutions
                 relied on aggressive speculation techniques, which has
                 not yet been realized in a practical implementation.
                 This paper exploits the observation that hardware need
                 not enforce any memory model constraints on accesses to
                 thread-local and shared read-only locations. A
                 processor can easily determine a large fraction of
                 these safe accesses with assistance from static
                 compiler analysis and the hardware memory management
                 unit. We discuss a low-complexity hardware design that
                 exploits this information to reduce the overhead in
                 ensuring SC. Our design employs an additional unordered
                 store buffer for fast-tracking thread-local stores and
                 allowing later memory accesses to proceed without a
                 memory ordering related stall. Our experimental study
                 shows that the cost of guaranteeing end-to-end SC is
                 only 6.2\% on average when compared to a system with
                 TSO hardware executing a stock compiler's output.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Mars:2012:BDS,
  author =       "Jason Mars and Naveen Kumar",
  title =        "{BlockChop}: dynamic squash elimination for hybrid
                 processor architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "536--547",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337221",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Hybrid processors are HW/SW co-designed processors
                 that leverage blocked-execution, the execution of
                 regions of instructions as atomic blocks, to facilitate
                 aggressive speculative optimization. As we move to a
                 multicore hybrid design, fine grained conflicts for
                 shared data can violate the atomicity requirement of
                 these blocks and lead to expensive squashes and
                 rollbacks. However, as these atomic regions differ from
                 those used in checkpointing and transactional memory
                 systems, the extent of this potentially prohibitive
                 problem remains unclear, and mechanisms to mitigate
                 these squashes dynamically may be critical to enable a
                 highly performant multicore hybrid design. In this
                 work, we investigate how multithreaded applications,
                 both benchmark and commercial workloads, are affected
                 by squashes, and present dynamic mechanisms for
                 mitigating these squashes in hybrid processors. While
                 the current wisdom is that there is not a significant
                 number of squashes for smaller atomic regions, we
                 observe this is not the case for many multithreaded
                 workloads. With region sizes of just 200--500
                 instructions, we observe a performance degradation
                 ranging from 10\% to more than 50\% for workloads with
                 a mixture of shared reads and writes. By harnessing the
                 unique flexibility provided by the software subsystem
                 of hybrid processor design, we present BlockChop, a
                 framework for dynamically mitigating squashes on
                 multicore hybrid processors. We present a range of
                 squash handling mechanisms leveraging retrials,
                 interpretation, and retranslation, and find that
                 BlockChop is quite effective. Over the current response
                 to exceptions and squashes in a hybrid design, we are
                 able to improve the performance of benchmark and
                 commercial workloads by 1.4x and 1.2x on average for
                 large and small region sizes respectively.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Yoon:2012:DGM,
  author =       "Doe Hyun Yoon and Min Kyu Jeong and Michael Sullivan
                 and Mattan Erez",
  title =        "The dynamic granularity memory system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "3",
  pages =        "548--559",
  month =        jun,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2366231.2337222",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 6 10:21:07 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '12 conference proceedings.",
  abstract =     "Chip multiprocessors enable continued performance
                 scaling with increasingly many cores per chip. As the
                 throughput of computation outpaces available memory
                 bandwidth, however, the system bottleneck will shift to
                 main memory. We present a memory system, the dynamic
                 granularity memory system (DGMS), which avoids
                 unnecessary data transfers, saves power, and improves
                 system performance by dynamically changing between fine
                 and coarse-grained memory accesses. DGMS predicts
                 memory access granularities dynamically in hardware,
                 and does not require software or OS support. The
                 dynamic operation of DGMS gives it superior ease of
                 implementation and power efficiency relative to prior
                 multi-granularity memory systems, while maintaining
                 comparable levels of system performance.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Aguilera:2012:AEW,
  author =       "Marcos K. Aguilera and Dahlia Malkhi and Keith
                 Marzullo and Alessandro Panconesi and Andrzej Pelc and
                 Roger Wattenhofer",
  title =        "Announcing the {2012 Edsger W. Dijkstra Prize in
                 Distributed Computing}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "4",
  pages =        "1--2",
  month =        sep,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2411116.2411118",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Dec 11 08:06:57 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Maitra:2012:NAC,
  author =       "Subhashis Maitra and Amitabha Sinha",
  title =        "A new algorithm for computing triple-base number
                 system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "4",
  pages =        "3--9",
  month =        sep,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2411116.2411119",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Dec 11 08:06:57 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "We introduce here a generalized method a new Algorithm
                 to find Triple-Base number system and Triple-Base chain
                 and hence in turn Single Digit Triple-Base number
                 system (SDTBNS). The proposed method is not only
                 simpler and faster than the Algorithms to find
                 Double-Base number system or Double-Base chain,
                 experimentally it also returns a shorter length of
                 Triple-Base chain which in turn reduces the size of the
                 look-up-table to find out SDTBNS. The complexity
                 analysis and experimental results shows the novelty of
                 the proposed Algorithm. Moreover when the proposed
                 method is applied to find scalar multiplication in case
                 of Elliptic Curve Cryptography and coefficient
                 multiplication in case of designing digital filter, its
                 efficiency also proves its novelty. Here we have used
                 third base as $5$ because when it is multiplied by $2$
                 gives $ 10$ which can be efficiently used for decimal
                 shifting, i.e. if an integer '$n$' can be represented
                 in SDTBNS form, then $ n / 10^x$ or $ n \times 10^x$
                 can also be represented in SDTBNS only by dividing or
                 multiplying '$n$' by $ 10$.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kumar:2012:NLT,
  author =       "Shiv Kumar and Seshadri Krishna Murthy and G.
                 Varaprasad and S. Sivasathya",
  title =        "Network load and traffic pattern on the capacity of
                 wireless ad hoc networks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "4",
  pages =        "10--25",
  month =        sep,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2411116.2411120",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Dec 11 08:06:57 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper focuses on the capacity of wireless ad hoc
                 networks and analyzes the effect of key factors viz.
                 network size, traffic patterns and detailed local radio
                 interactions on the capacity of such networks. The
                 capacity is evaluated with several different network
                 layouts and traffic patterns through simulations. To
                 demonstrate the impact of these factors, the capacity
                 evaluation starts with a simple case of a chain of
                 evenly spaced nodes in a network environment and
                 progresses to a network with random traffic and
                 randomly spaced nodes. Initially, capacity of static
                 nodes is evaluated for various network layouts and
                 traffic patterns. Since, in most scenarios, nodes do
                 not travel significant distances during packet
                 transmissions. As an enhancement, mobility of nodes is
                 introduced into the network scenario and the
                 performance is again evaluated. The simulations are
                 carried out using OPNET modeler and the results
                 obtained are presented in this report. The results are
                 analyzed to understand the impact of these factors on
                 the capacity and consequently suggest measures to
                 increase the same. This work shows that the achievable
                 capacity of ad hoc network depends on network size,
                 traffic pattern and mobility. In a single cell
                 topology, it is found that there is a 50\% reduction in
                 network throughput, if the node size increases from 2
                 to 10 nodes, whereas there is a 74\% reduction in the
                 throughput for chain topology for the same increase in
                 node size. In a lattice topology with horizontal
                 traffic, there is a 46\% reduction in network
                 throughput when the lattice size increases from $ 4
                 \times 4 $ to $ 5 \times 5 $. The same percentage of
                 reduction is observed when both horizontal and network
                 traffic is introduced. In a random network topology
                 with random traffic, there is an 80\% reduction in
                 network throughput when the node size increases from
                 150 to 750 nodes. However, for the same scenario with
                 the introduction of mobility to the nodes, a slight
                 improvement is achieved with an overall 75\% reduction
                 in network throughput.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Isa:2012:EAS,
  author =       "M. N. Isa and K. Benkrid and T. Clayton",
  title =        "Efficient architecture and scheduling technique for
                 pairwise sequence alignment",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "4",
  pages =        "26--31",
  month =        sep,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2411116.2411121",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Dec 11 08:06:57 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "A novel efficient hardware architecture to optimize
                 the execution time of dynamic programming-based (DP)
                 pairwise sequence alignment algorithms in hardware is
                 proposed. It is realized by introducing an efficient
                 overlapped scheduling of alignment matrix computation
                 and substitution coefficients' pre-loading onto
                 processing elements (PEs) in folded systolic arrays. A
                 new metric is also proposed as an independent
                 performance evaluator to compare different core
                 implementations on different FPGA platforms fairly.
                 Implementation results show that the new hardware
                 architecture for sequence alignment achieves a minimum
                 of 40 percent area normalized speed-up compared to the
                 state-of-the-art hardware implementation, with the
                 speed-up growing linearly with the number of folds e.g.
                 120 percent speed up for 16-fold. Compared to
                 equivalent software implementations, the novel hardware
                 architecture achieves a minimum of $ 103 \times $
                 speed-up, with the speed-up growing linearly with the
                 number of folds e.g. $ 140 \times $ speed-up for
                 20-fold.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Oudjida:2012:NHR,
  author =       "A. K. Oudjida and N. Chaillet and M. L. Berrandjia and
                 A. Liacha",
  title =        "A new high radix-$2^r$ ($ r \geq 8$) multibit recoding
                 algorithm for large operand size ({$ N \geq 32$})
                 multipliers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "4",
  pages =        "32--43",
  month =        sep,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2411116.2411122",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Dec 11 08:06:57 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper addresses the problem of multiplication
                 with large operand sizes ($ N \geq 32$). We propose a
                 new recursive recoding algorithm that shortens the
                 critical path of the multiplier and reduces the
                 hardware complexity of partial-product-generators as
                 well. The new recoding algorithm provides an optimal
                 space/time partitioning of the multiplier architecture
                 for any size $N$ of the operands. As a result, the
                 critical path is drastically reduced to $ 3 \sqrt[3]{N /
                 2} - 3$ with no area overhead in comparison to modified
                 Booth algorithm that shows a critical path of $ N / 2$
                 in adder stages. For instance, only $7$ adder stages
                 are needed for a 64-bit two's complement multiplier.
                 Confronted to reference algorithms for $ N = 64$,
                 important gain ratios of $ 1.62$, $ 1.71$, $ 2.64$ are
                 obtained in terms of multiply-time, energy consumption
                 per multiply operation, and total gate count,
                 respectively.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:2012:INb,
  author =       "Mark Thorson",
  title =        "{Internet} nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "4",
  pages =        "44--48",
  month =        sep,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2411116.2411124",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Dec 11 08:06:57 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This column consists of selected traffic from the {\tt
                 comp.arch} newsgroup, a forum for discussion of
                 computer architecture on the Internet---an
                 international computer network. As always, the opinions
                 expressed in this column are the personal views of the
                 authors, and do not necessarily represent the
                 institutions to which they are affiliated. Text which
                 sets the context of a message appears underlined or in
                 italics; this is usually text the author has quoted
                 from earlier messages. The code-like expressions below
                 the authors' names are their addresses on Internet.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Amano:2012:FBC,
  author =       "Hideharu Amano and Wayne Luk",
  title =        "{FPGA}-based {Connect6} solver with
                 hardware-accelerated move refinement",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "5",
  pages =        "4--9",
  month =        dec,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2460216.2460218",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Sun May 5 09:49:56 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "HEART '12 conference proceedings.",
  abstract =     "Connect6 is a two-player game similar to Go-Moku,
                 which was introduced in 2003. Since placing two stones
                 in each turn makes a huge game-tree, we require some
                 acceleration techniques for a solver based on a typical
                 approach to search the tree. This paper presents an
                 FPGA-based Connect6 solver with two-level move
                 refinement. The solver has the dedicated hardware to
                 accelerate the move refinement by exploiting various
                 parallelism with a systolic array, linear arrays, and
                 multiple score-calculation units. Implementation with a
                 low-end FPGA demonstrates that the accelerator allows
                 the two-level move refinement in the FPGA-based solver
                 running at 90 MHz to be 103695 and 414 times faster
                 than equivalent software implementation with NIOS II
                 soft processor on the FPGA and Intel Core i7 processor
                 operating at 2.93 GHz, respectively.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chau:2012:RRP,
  author =       "Thomas C. P. Chau and Wayne Luk and Peter Y. K.
                 Cheung",
  title =        "{Roberts}: reconfigurable platform for benchmarking
                 real-time systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "5",
  pages =        "10--15",
  month =        dec,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2460216.2460219",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Sun May 5 09:49:56 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "HEART '12 conference proceedings.",
  abstract =     "This paper presents Roberts, a Reconfigurable platfOrm
                 for BEnchmarking Real-Time Systems. Roberts is the
                 first platform which can be customised for a given
                 system-under-test to support benchmarking of real-time
                 properties and energy consumption. The benchmarking
                 takes into account system workload and environmental
                 events, with facilities for generating test vectors
                 conforming to the specification of system under test,
                 and with support for on-line monitoring of the response
                 time, output values and energy consumption. The
                 proposed benchmarking platform has been implemented in
                 the DE4 development system to provide cycle-accurate
                 timing measurement at nano-second precision to analyse
                 high performance applications. An evaluation of our
                 approach shows that the platform can be used in
                 analysing the performance of target applications and
                 overheads of other timing facilities, such as the
                 interval timer on processors.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kinoshita:2012:ARS,
  author =       "Kei Kinoshita and Daisuke Takano and Tomoyuki Okamura
                 and Tetsuhiko Yao and Yoshiki Yamaguchi",
  title =        "An augmented reality system with a coarse-grained
                 reconfigurable device",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "5",
  pages =        "16--21",
  month =        dec,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2460216.2460220",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Sun May 5 09:49:56 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "HEART '12 conference proceedings.",
  abstract =     "Image recognition and motion tracking are widely
                 utilized in the field of Augmented Reality (AR).
                 Although their computational cost is huge, they enable
                 to extend the practicality and the range of
                 applications if all computation is processed within
                 real time. Toward this goal, in this paper, we propose
                 a handheld AR system optimized for direct hardware
                 computation. It includes a subspace method for image
                 recognition and a KLT tracking algorithm for motion
                 tracking. The AR system is composed of one
                 two-million-pixel-CCD-image sensor, one head-mounted
                 display, one reconfigurable device called DAPDNA-2, and
                 so on. DAPDNA-2 is a coarse-grained and
                 dynamic-reconfigurable device which is produced by
                 Tokyo Keiki Inc. The merit of DAPDNA-2 is its
                 short-reconfiguration time and it is utilised to full
                 for not only high performance but also the reduction of
                 power consumption. The experimental result through a
                 real Japanese-English translation system shows image
                 recognition and motion tracking are computed within
                 real-time; the computation time is less than 0.741
                 milliseconds per a VGA-resolution (640 x 480 pixels)
                 frame. Thus, we are able to find a highly efficient
                 computation using a coarse-grained architecture
                 compared with general-purpose processors and embedded
                 processors.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ng:2012:STT,
  author          = {Nicholas Ng and Nobuko Yoshida and Xin Yu Niu and Kuen
                     Hung Tsoi},
  title           = {Session types: towards safe and fast reconfigurable
                     programming},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {40},
  number          = {5},
  pages           = {22--27},
  month           = dec,
  year            = {2012},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/2460216.2460221},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Sun May 5 09:49:56 MDT 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note            = {HEART '12 conference proceedings.},
  abstract        = {This paper introduces a new programming framework
                     based on the theory of session types for safe,
                     reconfigurable parallel designs. We apply the session
                     type theory to C and Java programming languages and
                     demonstrate that the session-based languages can offer
                     a clear and tractable framework to describe
                     communications between parallel components and
                     guarantee communication-safety and deadlock-freedom by
                     compile-time type checking. Many representative
                     communication topologies such as a ring or
                     scatter-gather can be programmed and verified in
                     session-based programming languages. Case studies
                     involving N-body simulation and Kmeans clustering are
                     used to illustrate the session-based programming style
                     and to demonstrate that the session-based languages
                     perform competitively against MPI counterparts in an
                     FPGA-based heterogeneous cluster, as well as the
                     potential of integrating them with FPGA acceleration.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Syed:2012:LOA,
  author          = {Rizwan Syed and Yajun Ha and Bharadwaj Veeravalli},
  title           = {A low overhead abstract architecture for {FPGA}
                     resource management},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {40},
  number          = {5},
  pages           = {28--33},
  month           = dec,
  year            = {2012},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/2460216.2460222},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Sun May 5 09:49:56 MDT 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note            = {HEART '12 conference proceedings.},
  abstract        = {To support dynamic applications, FPGAs will need to
                     have a software operating system equivalent resource
                     manager. An abstract FPGA architecture is the
                     foundation to develop such an FPGA resource manager.
                     Previous research projects work on the FPGA abstraction
                     by abstracting the computing and/or the communication
                     resources. However, various constraints made their
                     proposals practically less useful due to the
                     performance and/or the area overheads. We develop a low
                     overhead abstract FPGA architecture that has the
                     important features such as dynamically sized
                     reconfigurable regions, deterministic communications
                     among regions, clock network management and in-circuit
                     debugging for regions. The architecture is demonstrated
                     by implementing three applications on the Xilinx Virtex
                     5 FPGAs. We evaluate our work by comparing the area and
                     performance overheads due to the abstractions between
                     the abstracted and the non-abstracted applications.
                     Experimental results show that additional resources
                     required due to abstractions are found to be 6.4\% on
                     average. This is achieved with low overheads on the
                     timing performance.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Tsoi:2012:MRS,
  author          = {Kuen Hung Tsoi and Tobias Becker and Wayne Luk},
  title           = {Modelling reconfigurable systems in event driven
                     simulation},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {40},
  number          = {5},
  pages           = {34--39},
  month           = dec,
  year            = {2012},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/2460216.2460223},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Sun May 5 09:49:56 MDT 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note            = {HEART '12 conference proceedings.},
  abstract        = {Reconfigurable platforms allow hardware developers to
                     customise their designs for specific applications.
                     However, their adoption involves challenges in
                     understanding and estimating the impact of various
                     design parameters and approaches. This paper proposes a
                     unified framework to model behaviour of reconfigurable
                     systems using an event driven simulation approach. This
                     provides an abstract yet informative method to capture
                     both analytical relationships and empirical parameters
                     of reconfigurable systems. It can be used to help
                     making design decisions or verifying analytical models.
                     We apply this approach to three models of
                     reconfigurable applications to estimate the
                     communication efficiency of networked clusters, and the
                     performance and energy efficiency of runtime
                     reconfigurable designs for software-defined radio and
                     for option pricing in finance. The results show that,
                     through this simulation framework, we can verify the
                     accuracy of analytical models and also obtain practical
                     information that is not provided by analytical
                     models.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Shun:2012:FAC,
  author          = {Zheng Zhi Shun and Tsutomu Maruyama},
  title           = {{FPGA} acceleration of {CDO} pricing based on
                     correlation expansions},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {40},
  number          = {5},
  pages           = {40--45},
  month           = dec,
  year            = {2012},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/2460216.2460224},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Sun May 5 09:49:56 MDT 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note            = {HEART '12 conference proceedings.},
  abstract        = {Because of the significant growth in the financial
                     market, faster and accurate pricing of widespread
                     instruments is becoming more important. In this paper,
                     we describe an FPGA implementation of an analytical
                     method for collateralized debt obligation (CDO) pricing
                     in the multifactor Normal Copula model. Our experiments
                     show that the FPGA system is about 40 times faster than
                     corresponding software on a single core 3 GHz Intel
                     Core2 processor.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Nakahara:2012:WFF,
  author =       "Hiroki Nakahara and Hiroyuki Nakanishi and Tsutomu
                 Sasao",
  title =        "On a wideband {Fast Fourier Transform} for a radio
                 telescope",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "5",
  pages =        "46--51",
  month =        dec,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2460216.2460225",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Sun May 5 09:49:56 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "HEART '12 conference proceedings.",
  abstract =     "The radio telescope analyzes a radio frequency from
                 celestial objects by using fast Fourier transform
                 (FFT). In this application, its bandwidth f is wider
                 than that of the typical FFT. Since the amount of
                 hardware for the typical FFT circuit is proportional to
                 the bandwidth f, a special technique is necessary for
                 this application. This paper shows a realization of
                 wideband FFT for the radio telescope on an FPGA. We
                 show that the memory size for the conventional FFT,
                 which consists of the twiddle factor memory and the
                 transpose memory, is too large. We replace the twiddle
                 factor memory with the pipelined CORDIC. To reduce the
                 number of transpose memories, we increase the radix of
                 the FFT from $ 2^2 $ to $ 2^k $, also we use the
                 DDR2SDRAM to implement the transpose memory. We
                 implement the $ 2^{30} $-FFT on an Altera's Stratix IV
                 GX530 FPGA. It performs the $ 2^{30} $-FFT operations
                 in 1.5 seconds. Compared with the Altera's FFT library,
                 our FFT circuit realizes $ 2^{14} $ times wider
                 bandwidth on the same FPGA. Also, compared with Tesla
                 S1070 utilizing four GPUs, our FFT circuit is faster
                 and dissipates lower power.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ling:2012:HPP,
  author          = {Cheng Ling and Khaled Benkrid and Tsuyoshi Hamada},
  title           = {High performance phylogenetic analysis on
                     {CUDA}-compatible {GPUs}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {40},
  number          = {5},
  pages           = {52--57},
  month           = dec,
  year            = {2012},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/2460216.2460226},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Sun May 5 09:49:56 MDT 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note            = {HEART '12 conference proceedings.},
  abstract        = {The operation of phylogenetic analysis aims to
                     investigate the evolution and relationships among
                     species. It is widely used in the fields of system
                     biology and comparative genomics. However, phylogenetic
                     analysis is also a computationally intensive operation
                     as the number of tree topology grows in a factorial way
                     with the number of species involved. Therefore, due to
                     the large number of species in the real world, the
                     computational burden has largely thwarted phylogenetic
                     reconstruction. In this paper, we describe the detailed
                     GPU-based multi-threaded design and implementation of a
                     Markov Chain Monte Carlo (MCMC) maximum likelihood
                     algorithm for phylogenetic analysis on a set of aligned
                     nucleotide sequences. The implementation is based on
                     the framework of the most widely used phylogenetic
                     analysis tool, namely MrBayes. The proposed approach
                     resulted in 6x-8x speed-up on an NVidia Geforce 460 GTX
                     GPU compared to an optimized GPP-based software
                     implementation running on a desktop computer with a
                     single Intel Xeon 2.53 GHz CPU and 6.0 GB RAM.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Lin:2012:EED,
  author =       "Colin Yu Lin and Hayden Kwok-Hay So",
  title =        "Energy-efficient dataflow computations on {FPGAs}
                 using application-specific coarse-grain architecture
                 synthesis",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "5",
  pages =        "58--63",
  month =        dec,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2460216.2460227",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Sun May 5 09:49:56 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "HEART '12 conference proceedings.",
  abstract =     "Compiling high-level user applications to execute on
                 FPGA-based reconfigurable computers often involve
                 synthesizing dataflow graphs beyond the capacity of the
                 available hardware resources. A framework that provides
                 rapid and energy-efficient compilation of such dataflow
                 graphs on FPGAs using an array of pre-placed
                 configurable processing elements is proposed. The
                 mapping schedule of the compute operations on the CPEs
                 and the direct network among the CPEs are
                 co-synthesized on a per-application basis to provide
                 the targeted power-performance tradeoff. Compared to
                 the use of a fixed generic topology, the use of an
                 application-specific topology derived by a genetic
                 algorithm can achieve up to 28\% improvement in
                 energy-delay product. As the CPEs are pre-placed,
                 compiling for a new application involve only the
                 generation of a new operation schedule, which is stored
                 in on-chip memory, and the new routes among the CPEs.
                 With optimization in operation scheduling and mapping
                 and application-specific interconnect network, the
                 proposed framework achieved up to 199X better
                 energy-delay product compared to a traditional FPGA
                 high-level synthesis tool xPilot. The use of such
                 framework is anticipated to serve as part of a
                 high-level application compiler for hybrid CPU-FPGA
                 computation.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Malik:2012:ERA,
  author          = {Jamshaid Sarwar Malik and Paolo Palazzari and Ahmed
                     Hemani},
  title           = {Effort, resources, and abstraction vs performance in
                     high-level synthesis: finding new answers to an old
                     question},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {40},
  number          = {5},
  pages           = {64--69},
  month           = dec,
  year            = {2012},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/2460216.2460228},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Sun May 5 09:49:56 MDT 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note            = {HEART '12 conference proceedings.},
  abstract        = {This work provides new perspectives on impact of
                     design effort, consumed resources and design
                     abstraction on hardware performance in a high-level
                     synthesis flow. We have shown that counter to published
                     literature as well as intuition; more design effort may
                     not always result in better performance. We developed a
                     kernel that simulates Brownian motion, and investigated
                     improvement in hardware performance with design effort
                     at various abstraction levels. Our results indicate
                     that a designer should be careful in putting more
                     effort at a particular abstraction level. In our case,
                     we achieved best performance/effort ratio at algorithm
                     level rather than lower abstraction levels. This
                     strongly suggests that design effort is not always
                     proportional to corresponding improvement in
                     performance.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Kakimoto:2012:PCG,
  author =       "Takeshi Kakimoto and Keisuke Dohi and Yuichiro Shibata
                 and Kiyoshi Oguri",
  title =        "Performance comparison of {GPU} programming frameworks
                 with the striped {Smith--Waterman} algorithm",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "5",
  pages =        "70--75",
  month =        dec,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2460216.2460229",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Sun May 5 09:49:56 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "HEART '12 conference proceedings.",
  abstract =     "This paper evaluates and discusses how different GPU
                 programming frameworks affect the performance obtained
                 from GPU acceleration of the striped Smith--Waterman
                 algorithm used for biological sequence alignment. A
                 total of 6 GPU implementations of the algorithm on
                 NVIDIA GT200b and AMD RV870 using the CUDA and the
                 OpenCL frameworks are compared to analyze cons and pros
                 of explicit descriptions for architecture specific
                 hardware mechanisms in the code. The evaluation results
                 show that the primitive descriptions with the CUDA are
                 still efficient especially for small size data, while
                 better instruction scheduling and optimizations are
                 carried out by the OpenCL compiler. On the other hand,
                 the combination of OpenCL and RV870 which provides a
                 relatively simple view of the architecture is efficient
                 for the large data size.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tribino:2012:PPA,
  author          = {Julien Tribino and Antoine Trouv{\'e} and Hadrien A.
                     Clarke and Kazuaki J. Murakami},
  title           = {{PASTIS}: a photonic arbitration with scalable token
                     injection scheme},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {40},
  number          = {5},
  pages           = {76--81},
  month           = dec,
  year            = {2012},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/2460216.2460230},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Sun May 5 09:49:56 MDT 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note            = {HEART '12 conference proceedings.},
  abstract        = {This paper introduces PASTIS, a novel photonic
                     arbitration protocol based on a scalable token
                     injection scheme, and ring-based nanophotonic
                     technology. It aims at connecting together processors
                     and memories in many-core computer systems by means of
                     a ring topology. The main strength of PASTIS lays in
                     the fact that it uses photonic components exclusively,
                     that is, routing does not require any electronics. In
                     this work, we compare it with an hybrid opto-electronic
                     protocol as presented in a related work. Simulations
                     show that PASTIS performs better in terms of bandwidth,
                     latency and energy consumption. Indeed, it is scalable
                     as it can adapt its bandwidth to the system's workload,
                     thereby saving energy. Finally, we also study the
                     opportunity of using reconfigurable rings. We determine
                     that they almost halve the overall static power
                     consumption.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Watanabe:2012:MCP,
  author =       "Takahiro Watanabe and Minoru Watanabe",
  title =        "$ 0.18 \mu $ m {CMOS} process high-sensitivity
                 optically reconfigurable gate array {VLSI}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "40",
  number =       "5",
  pages =        "82--86",
  month =        dec,
  year =         "2012",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2460216.2460231",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Sun May 5 09:49:56 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "HEART '12 conference proceedings.",
  abstract =     "Currently, demand for high-speed dynamic
                 reconfiguration of a programmable device is increasing
                 for the purpose of increasing the performance of such
                 devices. To support the high speed dynamic
                 reconfiguration, optically reconfigurable gate arrays
                 (ORGAs) have been developed up to now. An ORGA consists
                 of a holographic memory, a laser array, and an
                 optically reconfigurable gate array VLSI. The
                 holographic memory can store many configuration
                 contexts. In addition, its large bandwidth optical
                 connection enables high speed reconfiguration. However,
                 photodiode sensitivities of conventional ORGAs were not
                 good. This paper therefore presents a newly fabricated
                 $ 0.18 \mu $ m CMOS process optically reconfigurable
                 gate array VLSI chip with highly sensitive
                 photocircuits.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Nakaya:2012:NVR,
  author          = {Shogo Nakaya and Makoto Miyamura and Noboru Sakimura
                     and Yuichi Nakamura and Tadahiko Sugibayashi},
  title           = {A non-volatile reconfigurable offloader for wireless
                     sensor nodes},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {40},
  number          = {5},
  pages           = {87--92},
  month           = dec,
  year            = {2012},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/2460216.2460232},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Sun May 5 09:49:56 MDT 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note            = {HEART '12 conference proceedings.},
  abstract        = {Energy saving is currently one of the most important
                     issues in the development of battery-powered wireless
                     sensor nodes (WSNs). We have developed a non-volatile
                     reconfigurable offloader for flexible and highly
                     efficient processing on WSNs that uses NanoBridges
                     (NBs), which are novel non-volatile and reprogrammable
                     switching elements. Non-volatility is essential for the
                     intermittent operation of WSNs due to the requirement
                     of power-on without loading configuration data. We
                     implemented a data compression algorithm on the
                     offloader that reduces energy consumption during data
                     transmission. Simulation results showed that the energy
                     consumption on the offloader was $ 11 / 21 $ of that on
                     an ultra-low power CPU.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Thorson:2012:INc,
  author          = {Mark Thorson},
  title           = {{Internet} nuggets},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {40},
  number          = {5},
  pages           = {93--112},
  month           = dec,
  year            = {2012},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/2460216.2460234},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Sun May 5 09:49:56 MDT 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  note            = {HEART '12 conference proceedings.},
  abstract        = {This column consists of selected traffic from the
                     comp.arch newsgroup, a forum for discussion of computer
                     architecture on the Internet---an international
                     computer network. As always, the opinions expressed in
                     this column are the personal views of the authors, and
                     do not necessarily represent the institutions to which
                     they are affiliated. Text which sets the context of a
                     message appears underlined or in italics; this is
                     usually text the author has quoted from earlier
                     messages. The code-like expressions below the authors'
                     names are their addresses on Internet.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

%%% NOTE: the citation label Bond:2013:GDG is retained so that existing
%%% citations still resolve; the first author of this paper is Jooybar.
@Article{Bond:2013:GDG,
  author =       "Hadi Jooybar and Wilson W. L. Fung and Mike O'Connor
                 and Joseph Devietti and Tor M. Aamodt",
  title =        "{GPUDet}: a deterministic {GPU} architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "1",
  pages =        "1--12",
  month =        mar,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490301.2451118",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:40:49 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Nondeterminism is a key challenge in developing
                 multithreaded applications. Even with the same input,
                 each execution of a multithreaded program may produce a
                 different output. This behavior complicates debugging
                 and limits one's ability to test for correctness. This
                 non-reproducibility situation is aggravated on
                 massively parallel architectures like graphics
                 processing units (GPUs) with thousands of concurrent
                 threads. We believe providing a deterministic
                 environment to ease debugging and testing of GPU
                 applications is essential to enable a broader class of
                 software to use GPUs. Many hardware and software
                 techniques have been proposed for providing determinism
                 on general-purpose multi-core processors. However,
                 these techniques are designed for small numbers of
                 threads. Scaling them to thousands of threads on a GPU
                 is a major challenge. This paper proposes a scalable
                 hardware mechanism, GPUDet, to provide determinism in
                 GPU architectures. In this paper we characterize the
                 existing deterministic and nondeterministic aspects of
                 current GPU execution models, and we use these
                 observations to inform GPUDet's design. For example,
                 GPUDet leverages the inherent determinism of the SIMD
                 hardware in GPUs to provide determinism within a
                 wavefront at no cost. GPUDet also exploits the Z-Buffer
                 Unit, an existing GPU hardware unit for graphics
                 rendering, to allow parallel out-of-order memory writes
                 to produce a deterministic output. Other optimizations
                 in GPUDet include deterministic parallel execution of
                 atomic operations and a workgroup-aware algorithm that
                 eliminates unnecessary global synchronizations. Our
                 simulation results indicate that GPUDet incurs only 2X
                 slowdown on average over a baseline nondeterministic
                 architecture, with runtime overheads as low as 4\% for
                 compute-bound applications, despite running GPU kernels
                 with thousands of threads. We also characterize the
                 sources of overhead for deterministic execution on GPUs
                 to provide insights for further optimizations.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{Sung:2013:DEH,
  author          = {Hyojin Sung and Rakesh Komuravelli and Sarita V.
                     Adve},
  title           = {{DeNovoND}: efficient hardware support for disciplined
                     non-determinism},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {41},
  number          = {1},
  pages           = {13--26},
  month           = mar,
  year            = {2013},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/2490301.2451119},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Thu Sep 4 07:40:49 MDT 2014},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Recent work has shown that disciplined shared-memory
                     programming models that provide
                     deterministic-by-default semantics can simplify both
                     parallel software and hardware. Specifically, the
                     DeNovo hardware system has shown that the software
                     guarantees of such models (e.g., data-race-freedom and
                     explicit side-effects) can enable simpler, higher
                     performance, and more energy-efficient hardware than
                     the current state-of-the-art for deterministic
                     programs. Many applications, however, contain
                     non-deterministic parts; e.g., using lock
                     synchronization. For commercial hardware to exploit the
                     benefits of DeNovo, it is therefore necessary to extend
                     DeNovo to support non-deterministic applications. This
                     paper proposes DeNovoND, a system that supports
                     lock-based, disciplined non-determinism, with the
                     simplicity, performance, and energy benefits of DeNovo.
                     We use a combination of distributed queue-based locks
                     and access signatures to implement simple memory
                     consistency semantics for safe non-determinism, with a
                     coherence protocol that does not require transient
                     states, invalidation traffic, or directories, and does
                     not incur false sharing. The resulting system is
                     simpler, shows comparable or better execution time, and
                     has 33\% less network traffic on average (translating
                     directly into energy savings) relative to a
                     state-of-the-art invalidation-based protocol for 8
                     applications designed for lock synchronization.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  remark          = {ASPLOS '13 conference proceedings.},
}

@Article{Wester:2013:PDR,
  author =       "Benjamin Wester and David Devecsery and Peter M. Chen
                 and Jason Flinn and Satish Narayanasamy",
  title =        "Parallelizing data race detection",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "1",
  pages =        "27--38",
  month =        mar,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490301.2451120",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:40:49 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Detecting data races in multithreaded programs is a
                 crucial part of debugging such programs, but
                 traditional data race detectors are too slow to use
                 routinely. This paper shows how to speed up race
                 detection by spreading the work across multiple cores.
                 Our strategy relies on uniparallelism, which executes
                 time intervals of a program (called epochs) in
                 parallel to provide scalability, but executes all
                 threads from a single epoch on a single core to
                 eliminate locking overhead. We use several techniques
                 to make parallelization effective: dividing race
                 detection into three phases, predicting a subset of the
                 analysis state, eliminating sequential work via
                 transitive reduction, and reducing the work needed to
                 maintain multiple versions of analysis via
                 factorization. We demonstrate our strategy by
                 parallelizing a happens-before detector and a
                 lockset-based detector. We find that uniparallelism can
                 significantly speed up data race detection. With 4x the
                 number of cores as the original application, our
                 strategy speeds up the median execution time by 4.4x
                 for a happens-before detector and 3.3x for a lockset
                 race detector. Even on the same number of cores as the
                 conventional detectors, the ability for uniparallelism
                 to elide analysis locks allows it to reduce the median
                 overhead by 13\% for a happens-before detector and 8\%
                 for a lockset detector.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{Lucia:2013:CEF,
  author = {Brandon Lucia and Luis Ceze},
  title = {Cooperative empirical failure avoidance for
    multithreaded programs},
  journal = j-COMP-ARCH-NEWS,
  year = {2013},
  month = mar,
  volume = {41},
  number = {1},
  pages = {39--50},
  DOI = {https://doi.org/10.1145/2490301.2451121},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  CODEN = {CANED2},
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS '13 conference proceedings.},
  abstract = {Concurrency errors in multithreaded programs are
    difficult to find and fix. We propose Aviso, a system
    for avoiding schedule-dependent failures. Aviso
    monitors events during a program's execution and, when
    a failure occurs, records a history of events from the
    failing execution. It uses this history to generate
    schedule constraints that perturb the order of events
    in the execution and thereby avoids schedules that lead
    to failures in future program executions. Aviso
    leverages scenarios where many instances of the same
    software run, using a statistical model of program
    behavior and experimentation to determine which
    constraints most effectively avoid failures. After
    implementing Aviso, we showed that it decreased failure
    rates for a variety of important desktop, server, and
    cloud applications by orders of magnitude, with an
    average overhead of less than 20\% and, in some cases,
    as low as 5\%.},
  bibdate = {Thu Sep 4 07:40:49 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Goiri:2013:PGM,
  author = {{\'I}{\~n}igo Goiri and William Katsak and Kien Le and
    Thu D. Nguyen and Ricardo Bianchini},
  title = {{Parasol} and {GreenSwitch}: managing datacenters
    powered by renewable energy},
  journal = j-COMP-ARCH-NEWS,
  year = {2013},
  month = mar,
  volume = {41},
  number = {1},
  pages = {51--64},
  DOI = {https://doi.org/10.1145/2490301.2451123},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  CODEN = {CANED2},
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS '13 conference proceedings.},
  abstract = {Several companies have recently announced plans to
    build ``green'' datacenters, i.e. datacenters partially
    or completely powered by renewable energy. These
    datacenters will either generate their own renewable
    energy or draw it directly from an existing nearby
    plant. Besides reducing carbon footprints, renewable
    energy can potentially reduce energy costs, reduce peak
    power costs, or both. However, certain renewable fuels
    are intermittent, which requires approaches for
    tackling the energy supply variability. One approach is
    to use batteries and/or the electrical grid as a backup
    for the renewable energy. It may also be possible to
    adapt the workload to match the renewable energy
    supply. For highest benefits, green datacenter
    operators must intelligently manage their workloads and
    the sources of energy at their disposal. In this paper,
    we first discuss the tradeoffs involved in building
    green datacenters today and in the future. Second, we
    present Parasol, a prototype green datacenter that we
    have built as a research platform. Parasol comprises a
    small container, a set of solar panels, a battery bank,
    and a grid-tie. Third, we describe GreenSwitch, our
    model-based approach for dynamically scheduling the
    workload and selecting the source of energy to use. Our
    real experiments with Parasol, GreenSwitch, and
    MapReduce workloads demonstrate that intelligent
    workload and energy source management can produce
    significant cost reductions. Our results also isolate
    the cost implications of peak power management, storing
    energy on the grid, and the ability to delay the
    MapReduce jobs. Finally, our results demonstrate that
    careful workload and energy source management can
    minimize the negative impact of electrical grid
    outages.},
  bibdate = {Thu Sep 4 07:40:49 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Shen:2013:PCF,
  author = {Kai Shen and Arrvindh Shriraman and Sandhya Dwarkadas
    and Xiao Zhang and Zhuan Chen},
  title = {Power containers: an {OS} facility for fine-grained
    power and energy management on multicore servers},
  journal = j-COMP-ARCH-NEWS,
  year = {2013},
  month = mar,
  volume = {41},
  number = {1},
  pages = {65--76},
  DOI = {https://doi.org/10.1145/2490301.2451124},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  CODEN = {CANED2},
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS '13 conference proceedings.},
  abstract = {Energy efficiency and power capping are critical
    concerns in server and cloud computing systems. They
    face growing challenges due to dynamic power variations
    from new client-directed web applications, as well as
    complex behaviors due to multicore resource sharing and
    hardware heterogeneity. This paper presents a new
    operating system facility called ``power containers''
    that accounts for and controls the power and energy
    usage of individual fine-grained requests in multicore
    servers. This facility relies on three key techniques
    --- (1) online model that attributes multicore power
    (including shared maintenance power) to concurrently
    running tasks, (2) alignment of actual power
    measurements and model estimates to enable online model
    recalibration, and (3) on-the-fly
    application-transparent request tracking in multi-stage
    servers to isolate the power and energy contributions
    and customize per-request control. Our mechanisms
    enable new multicore server management capabilities
    including fair power capping that only penalizes
    power-hungry requests, and energy-aware request
    distribution between heterogeneous servers. Our
    evaluation uses three multicore processors (Intel
    Woodcrest, Westmere, and SandyBridge) and a variety of
    server and cloud computing (Google App Engine)
    workloads. Our results demonstrate the high accuracy of
    our request power accounting (no more than 11\% errors)
    and the effectiveness of container-enabled power virus
    isolation and throttling. Our request distribution case
    study shows up to 25\% energy saving compared to an
    alternative approach that recognizes machine
    heterogeneity but not fine-grained workload affinity.},
  bibdate = {Thu Sep 4 07:40:49 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Delimitrou:2013:PQA,
  author = {Christina Delimitrou and Christos Kozyrakis},
  title = {{Paragon}: {QoS}-aware scheduling for heterogeneous
    datacenters},
  journal = j-COMP-ARCH-NEWS,
  year = {2013},
  month = mar,
  volume = {41},
  number = {1},
  pages = {77--88},
  DOI = {https://doi.org/10.1145/2490301.2451125},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  CODEN = {CANED2},
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS '13 conference proceedings.},
  abstract = {Large-scale datacenters (DCs) host tens of thousands
    of diverse applications each day. However, interference
    between colocated workloads and the difficulty to match
    applications to one of the many hardware platforms
    available can degrade performance, violating the
    quality of service (QoS) guarantees that many cloud
    workloads require. While previous work has identified
    the impact of heterogeneity and interference, existing
    solutions are computationally intensive, cannot be
    applied online and do not scale beyond few
    applications. We present Paragon, an online and
    scalable DC scheduler that is heterogeneity and
    interference-aware. Paragon is derived from robust
    analytical methods and instead of profiling each
    application in detail, it leverages information the
    system already has about applications it has previously
    seen. It uses collaborative filtering techniques to
    quickly and accurately classify an unknown, incoming
    workload with respect to heterogeneity and interference
    in multiple shared resources, by identifying
    similarities to previously scheduled applications. The
    classification allows Paragon to greedily schedule
    applications in a manner that minimizes interference
    and maximizes server utilization. Paragon scales to
    tens of thousands of servers with marginal scheduling
    overheads in terms of time or state. We evaluate
    Paragon with a wide range of workload scenarios, on
    both small and large-scale systems, including 1,000
    servers on EC2. For a 2,500-workload scenario, Paragon
    enforces performance guarantees for 91\% of
    applications, while significantly improving
    utilization. In comparison, heterogeneity-oblivious,
    interference-oblivious and least-loaded schedulers only
    provide similar guarantees for 14\%, 11\% and 3\% of
    workloads. The differences are more striking in
    oversubscribed scenarios where resource efficiency is
    more critical.},
  bibdate = {Thu Sep 4 07:40:49 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Tang:2013:RRS,
  author = {Lingjia Tang and Jason Mars and Wei Wang and Tanima
    Dey and Mary Lou Soffa},
  title = {{ReQoS}: reactive static\slash dynamic compilation for
    {QoS} in warehouse scale computers},
  journal = j-COMP-ARCH-NEWS,
  year = {2013},
  month = mar,
  volume = {41},
  number = {1},
  pages = {89--100},
  DOI = {https://doi.org/10.1145/2490301.2451126},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  CODEN = {CANED2},
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS '13 conference proceedings.},
  abstract = {As multicore processors with expanding core counts
    continue to dominate the server market, the overall
    utilization of the class of datacenters known as
    warehouse scale computers (WSCs) depends heavily on
    colocation of multiple workloads on each server to take
    advantage of the computational power provided by modern
    processors. However, many of the applications running
    in WSCs, such as websearch, are user-facing and have
    quality of service (QoS) requirements. When multiple
    applications are co-located on a multicore machine,
    contention for shared memory resources threatens
    application QoS as severe cross-core performance
    interference may occur. WSC operators are left with two
    options: either disregard QoS to maximize WSC
    utilization, or disallow the co-location of
    high-priority user-facing applications with other
    applications, resulting in low machine utilization and
    millions of dollars wasted. This paper presents ReQoS,
    a static/dynamic compilation approach that enables
    low-priority applications to adaptively manipulate
    their own contentiousness to ensure the QoS of
    high-priority co-runners. ReQoS is composed of a
    profile guided compilation technique that identifies
    and inserts markers in contentious code regions in
    low-priority applications, and a lightweight runtime
    that monitors the QoS of high-priority applications and
    reactively reduces the pressure low-priority
    applications generate to the memory subsystem when
    cross-core interference is detected. In this work, we
    show that ReQoS can accurately diagnose contention and
    significantly reduce performance interference to ensure
    application QoS. Applying ReQoS to SPEC2006 and
    SmashBench workloads on real multicore machines, we are
    able to improve machine utilization by more than 70\%
    in many cases, and more than 50\% on average, while
    enforcing a 90\% QoS threshold. We are also able to
    improve the energy efficiency of modern multicore
    machines by 47\% on average over a policy of
    disallowing co-locations.},
  bibdate = {Thu Sep 4 07:40:49 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Arulraj:2013:PRS,
  author = {Joy Arulraj and Po-Chun Chang and Guoliang Jin and
    Shan Lu},
  title = {Production-run software failure diagnosis via hardware
    performance counters},
  journal = j-COMP-ARCH-NEWS,
  year = {2013},
  month = mar,
  volume = {41},
  number = {1},
  pages = {101--112},
  DOI = {https://doi.org/10.1145/2490301.2451128},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  CODEN = {CANED2},
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS '13 conference proceedings.},
  abstract = {Sequential and concurrency bugs are widespread in
    deployed software. They cause severe failures and huge
    financial loss during production runs. Tools that
    diagnose production-run failures with low overhead are
    needed. The state-of-the-art diagnosis techniques use
    software instrumentation to sample program properties
    at run time and use off-line statistical analysis to
    identify properties most correlated with failures.
    Although promising, these techniques suffer from high
    run-time overhead, which is sometimes over 100\%, for
    concurrency-bug failure diagnosis and hence are not
    suitable for production-run usage. We present PBI, a
    system that uses existing hardware performance counters
    to diagnose production-run failures caused by
    sequential and concurrency bugs with low overhead. PBI
    is designed based on several key observations. First, a
    few widely supported performance counter events can
    reflect a wide variety of common software bugs and can
    be monitored by hardware with almost no overhead.
    Second, the counter overflow interrupt supported by
    existing hardware and operating systems provides a
    natural and effective mechanism to conduct event
    sampling at user level. Third, the noise and
    non-determinism in interrupt delivery complements well
    with statistical processing. We evaluate PBI using 13
    real-world concurrency and sequential bugs from
    representative open-source server, client, and utility
    programs, and 10 bugs from a widely used
    software-testing benchmark. Quantitatively, PBI can
    effectively diagnose failures caused by these bugs with
    a small overhead that is never higher than 10\%.
    Qualitatively, PBI does not require any change to
    software and presents a novel use of existing hardware
    performance counters.},
  bibdate = {Thu Sep 4 07:40:49 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Zhang:2013:CFC,
  author =       "Wei Zhang and Marc de Kruijf and Ang Li and Shan Lu
                 and Karthikeyan Sankaralingam",
  title =        "{ConAir}: featherweight concurrency bug recovery via
                 single-threaded idempotent execution",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "1",
  pages =        "113--126",
  month =        mar,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490301.2451129",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:40:49 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Many concurrency bugs are hidden in deployed software
                 and cause severe failures for end-users. When they
                 finally manifest and become known by developers, they
                 are difficult to fix correctly. To support end-users,
                 we need techniques that help software survive hidden
                 concurrency bugs during production runs. To help
                 developers, we need techniques that fix exposed
                 concurrency bugs. The state-of-the-art techniques on
                 concurrency-bug fixing and survival only satisfy a
                 subset of four important properties: compatibility,
                 correctness, generality, and performance. We aim to
                 develop a system that satisfies all of these four
                 properties. To achieve this goal, we leverage two
                 observations: (1) rolling back a single thread is
                 sufficient to recover from most concurrency-bug
                 failures; (2) reexecuting an idempotent region, which
                 requires no memory-state checkpoint, is sufficient to
                 recover from many concurrency-bug failures. Our system
                 ConAir includes a static analysis component that
                 automatically identifies potential failure sites, a
                 static analysis component that automatically identifies
                 the idempotent code regions around every failure site,
                 and a code-transformation component that inserts
                 rollback-recovery code around the identified idempotent
                 regions. We evaluated ConAir on 10 real-world
                 concurrency bugs in widely used C/C++ open-source
                 applications. These bugs cover different types of
                 failure symptoms and root causes. Quantitatively,
                 ConAir helps software survive failures caused by all of
                 these bugs with negligible run-time overhead ({$<$1}\%)
                 and short recovery time. Qualitatively, ConAir can help
                 recover from failures caused by unknown bugs. It
                 guarantees that program semantics remain unchanged and
                 requires no change to operating systems or hardware.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{Viennot:2013:TMR,
  author = {Nicolas Viennot and Siddharth Nair and Jason Nieh},
  title = {Transparent mutable replay for multicore debugging and
    patch validation},
  journal = j-COMP-ARCH-NEWS,
  year = {2013},
  month = mar,
  volume = {41},
  number = {1},
  pages = {127--138},
  DOI = {https://doi.org/10.1145/2490301.2451130},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  CODEN = {CANED2},
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS '13 conference proceedings.},
  abstract = {We present Dora, a mutable record-replay system which
    allows a recorded execution of an application to be
    replayed with a modified version of the application.
    This feature, not available in previous record-replay
    systems, enables powerful new functionality. In
    particular, Dora can help reproduce, diagnose, and fix
    software bugs by replaying a version of a recorded
    application that is recompiled with debugging
    information, reconfigured to produce verbose log
    output, modified to include additional print
    statements, or patched to fix a bug. Dora uses
    lightweight operating system mechanisms to record an
    application execution by capturing nondeterministic
    events to a log without imposing unnecessary timing and
    ordering constraints. It replays the log using a
    modified version of the application even in the
    presence of added, deleted, or modified operations that
    do not match events in the log. Dora searches for a
    replay that minimizes differences between the log and
    the replayed execution of the modified program. If
    there are no modifications, Dora provides deterministic
    replay of the unmodified program. We have implemented a
    Linux prototype which provides transparent mutable
    replay without recompiling or relinking applications.
    We show that Dora is useful for reproducing,
    diagnosing, and fixing software bugs in real-world
    applications, including Apache and MySQL. Our results
    show that Dora (1) captures bugs and replays them with
    applications modified or reconfigured to produce
    additional debugging output for root cause diagnosis,
    (2) captures exploits and replays them with patched
    applications to validate that the patches successfully
    eliminate vulnerabilities, (3) records production
    workloads and replays them with patched applications to
    validate patches with realistic workloads, and (4)
    maintains low recording overhead on commodity multicore
    hardware, making it suitable for production systems.},
  bibdate = {Thu Sep 4 07:40:49 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Sahoo:2013:ULI,
  author = {Swarup Kumar Sahoo and John Criswell and Chase Geigle
    and Vikram Adve},
  title = {Using likely invariants for automated software fault
    localization},
  journal = j-COMP-ARCH-NEWS,
  year = {2013},
  month = mar,
  volume = {41},
  number = {1},
  pages = {139--152},
  DOI = {https://doi.org/10.1145/2490301.2451131},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  CODEN = {CANED2},
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS '13 conference proceedings.},
  abstract = {We propose an automatic diagnosis technique for
    isolating the root cause(s) of software failures. We
    use likely program invariants, automatically generated
    using correct inputs that are close to the
    fault-triggering input, to select a set of candidate
    program locations which are possible root causes. We
    then trim the set of candidate root causes using
    software-implemented dynamic backwards slicing, plus
    two new filtering heuristics: dependence filtering, and
    filtering via multiple failing inputs that are also
    close to the failing input. Experimental results on
    reported software bugs of three large open-source
    servers show that we are able to narrow down the number
    of candidate bug locations to between 5 and 17 program
    expressions, even in programs that are hundreds of
    thousands of lines long.},
  bibdate = {Thu Sep 4 07:40:49 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Paulos:2013:REA,
  author =       "Eric Paulos",
  title =        "The rise of the expert amateur: {DIY} culture and the
                 evolution of computer science",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "1",
  pages =        "153--154",
  month =        mar,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490301.2451133",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:40:49 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "We are at an important technological inflection point.
                 Most of our computing systems have been designed and
                 built by professionally trained experts (i.e.\ us ---
                 computer scientists, engineers, and designers) for use
                 in specific domains and to solve explicit problems.
                 Artifacts often called ``user manuals'' traditionally
                 prescribed the appropriate usage of these tools and
                 implied an acceptable etiquette for interaction and
                 experience. A fringe group of individuals usually
                 labeled ``hackers'' or ``amateurs'' or ``makers'' have
                 challenged this producer-consumer model of technology
                 by creating novel hardware and software features to
                 ``improve'' our research and products while a similar
                 creative group of technicians called ``artists'' have
                 redirected the techniques, tools, and tenets of
                 accepted technological usage away from their typical
                 manifestations in practicality and product. Over time
                 the technological artifacts of these fringe groups and
                 the support for their rhetoric have gained them a
                 foothold into computing culture and eroded the
                 established power discontinuities within the practice
                 of computing research. We now expect our computing
                 tools to be driven by an architecture of open
                 participation and democracy that encourages users to
                 add value to their tools and applications as they use
                 them. Similarly, the bar for enabling the design of
                 novel, personal computing systems and ``hardware
                 remixes'' has fallen to the point where many
                 non-experts and novices are readily embracing and
                 creating fascinating and ingenious computing artifacts
                 outside of our official and traditionally sanctioned
                 academic and industrial research communities. But how
                 have we as ``expert'' practitioners been influencing
                 this discussion? By constructing a practice around the
                 design and development of technology for task based and
                 problem solving applications, we have unintentionally
                 established such work as the status quo for the human
                 computing experience. We have failed in our duty to
                 open up alternate forums for technology to express
                 itself and touch our lives beyond productivity and
                 efficiency. Blinded by our quest for ``smart
                 technologies'' we have forgotten to contemplate the
                 design of technologies to inspire us to be smarter,
                 more curious, and more inquisitive. We owe it to
                 ourselves to rethink the impact we desire to have on
                 this historic moment in computing culture. We must
                 choose to participate in and perhaps lead a dialogue
                 that heralds an expansive new acceptable practice of
                 designing to enable participation by experts and
                 non-experts alike. We are in the milieu of the rise of
                 the ``expert amateur''. We must change our mantra ---
                 not just performance, completeness, and usability but
                 openness, usefulness and relevancy to our world, its
                 citizens, and our environment. This talk will explore
                 elements of the DIY and maker culture and its relevancy
                 to research questions across computational hardware,
                 languages, and systems. Ultimately, this talk will
                 outline and argue for expanding the design territory
                 and potential opportunities for all of us to
                 collaborate and benefit as a society from this cultural
                 movement.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{Raghavan:2013:CSH,
  author =       "Arun Raghavan and Laurel Emurian and Lei Shao and
                 Marios Papaefthymiou and Kevin P. Pipe and Thomas F.
                 Wenisch and Milo M. K. Martin",
  title =        "Computational sprinting on a hardware\slash software
                 testbed",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "1",
  pages =        "155--166",
  month =        mar,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490301.2451135",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:40:49 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "CMOS scaling trends have led to an inflection point
                 where thermal constraints (especially in mobile devices
                 that employ only passive cooling) preclude sustained
                 operation of all transistors on a chip --- a phenomenon
                 called ``dark silicon.'' Recent research proposed
                 computational sprinting --- exceeding sustainable
                 thermal limits for short intervals --- to improve
                 responsiveness in light of the bursty computation
                 demands of many media-rich interactive mobile
                 applications. Computational sprinting improves
                 responsiveness by activating reserve cores (parallel
                 sprinting) and/or boosting frequency/voltage (frequency
                 sprinting) to power levels that far exceed the system's
                 sustainable cooling capabilities, relying on thermal
                 capacitance to buffer heat. Prior work analyzed the
                 feasibility of sprinting through modeling and
                 simulation. In this work, we investigate sprinting
                 using a hardware/software testbed. First, we study
                 unabridged sprints, wherein the computation completes
                 before temperature becomes critical, demonstrating a
                 $6.3\times$ responsiveness gain, and a 6\% energy efficiency
                 improvement by racing to idle. We then analyze
                 truncated sprints, wherein our software runtime system
                 must intervene to prevent overheating by throttling
                 parallelism and frequency before the computation is
                 complete. To avoid oversubscription penalties (context
                 switching inefficiencies after a truncated parallel
                 sprint), we develop a sprint-aware task-based parallel
                 runtime. We find that maximal-intensity sprinting is
                 not always best, introduce the concept of sprint
                 pacing, and evaluate an adaptive policy for selecting
                 sprint intensity. We report initial results using a
                 phase change heat sink to extend maximum sprint
                 duration. Finally, we demonstrate that a
                 sprint-and-rest operating regime can actually
                 outperform thermally-limited sustained execution.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{Ahn:2013:DAS,
  author =       "Wonsun Ahn and Yuelu Duan and Josep Torrellas",
  title =        "{DeAliaser}: alias speculation using atomic region
                 support",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "1",
  pages =        "167--180",
  month =        mar,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490301.2451136",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:40:49 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Alias analysis is a critical component in many
                 compiler optimizations. A promising approach to reduce
                 the complexity of alias analysis is to use speculation.
                 The approach consists of performing optimizations
                 assuming the alias relationships that are true most of
                 the time, and repairing the code when such
                 relationships are found not to hold through runtime
                 checks. This paper proposes a general alias speculation
                 scheme that leverages upcoming hardware support for
                 transactions with the help of some ISA extensions. The
                 ability of transactions to checkpoint and roll back
                 frees the compiler to pursue aggressive optimizations
                 without having to worry about recovery code. Also,
                 exposing the memory conflict detection hardware in
                 transactions to software allows runtime checking of
                 aliases with little or no overhead. We test the
                 potential of the novel alias speculation approach with
                 Loop Invariant Code Motion (LICM), Global Value
                 Numbering (GVN), and Partial Redundancy Elimination
                 (PRE) optimization passes. On average, they are shown
                 to reduce program execution time by 9\% in SPEC FP2006
                 applications and 3\% in SPEC INT2006 applications over
                 the alias analysis of a state-of-the-art compiler.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{Park:2013:RCH,
  author =       "Heekwon Park and Seungjae Baek and Jongmoo Choi and
                 Donghee Lee and Sam H. Noh",
  title =        "Regularities considered harmful: forcing randomness to
                 memory accesses to reduce row buffer conflicts for
                 multi-core, multi-bank systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "1",
  pages =        "181--192",
  month =        mar,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490301.2451137",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:40:49 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "We propose a novel kernel-level memory allocator,
                 called M$^3$ (M-cube, Multi-core Multi-bank Memory
                 allocator), that has the following two features. First,
                 it introduces and makes use of a notion of a memory
                 container, which is defined as a unit of memory that
                 comprises the minimum number of page frames that can
                 cover all the banks of the memory organization, by
                 exclusively assigning a container to a core so that
                 each core achieves bank parallelism as much as
                 possible. Second, it orchestrates page frame allocation
                 so that pages that threads access are dispersed
                 randomly across multiple banks so that each thread's
                 access pattern is randomized. The development of M$^3$
                 is based on a tool that we develop to fully understand
                 the architectural characteristics of the underlying
                 memory organization. Using an extension of this tool,
                 we observe that the same application that accesses
                 pages in a random manner outperforms one that accesses
                 pages in a regular pattern such as sequential or same
                 ordered accesses. This is because such randomized
                 accesses reduces inter-thread access interference on
                 the row-buffer in memory banks. We implement M$^3$ in
                 the Linux kernel version 2.6.32 on the Intel Xeon
                 system that has 16 cores and 32GB DRAM. Performance
                 evaluation with various workloads show that M$^3$
                 improves the overall performance for memory intensive
                 benchmarks by up to 85\% with an average of about
                 40\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{Honarmand:2013:CUA,
  author =       "Nima Honarmand and Nathan Dautenhahn and Josep
                 Torrellas and Samuel T. King and Gilles Pokam and
                 Cristiano Pereira",
  title =        "{Cyrus}: unintrusive application-level record-replay
                 for replay parallelism",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "1",
  pages =        "193--206",
  month =        mar,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490301.2451138",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:40:49 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Architectures for deterministic record-replay (R\&R)
                 of multithreaded code are attractive for program
                 debugging, intrusion analysis, and fault-tolerance
                 uses. However, very few of the proposed designs have
                 focused on maximizing replay speed --- a key enabling
                 property of these systems. The few efforts that focus
                 on replay speed require intrusive hardware or software
                 modifications, or target whole-system R\&R rather
                 than the more useful application-level R\&R. This
                 paper presents the first hardware-based scheme for
                 unintrusive, application-level R\&R that explicitly
                 targets high replay speed. Our scheme, called Cyrus,
                 requires no modification to commodity snoopy cache
                 coherence. It introduces the concept of an on-the-fly
                 software Backend Pass during recording which, as the
                 log is being generated, transforms it for high replay
                 parallelism. This pass also fixes-up the log, and can
                 flexibly trade-off replay parallelism for log size. We
                 analyze the performance of Cyrus using full system (OS
                 plus hardware) simulation. Our results show that Cyrus
                 has negligible recording overhead. In addition, for
                 8-processor runs of SPLASH-2, Cyrus attains an average
                 replay parallelism of 5, and a replay speed that is, on
                 average, only about 50\% lower than the recording
                 speed.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{deOliveira:2013:WYS,
  author =       "Augusto Born de Oliveira and Sebastian Fischmeister
                 and Amer Diwan and Matthias Hauswirth and Peter F.
                 Sweeney",
  title =        "Why you should care about quantile regression",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "1",
  pages =        "207--218",
  month =        mar,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490301.2451140",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:40:49 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Research has shown that correctly conducting and
                 analysing computer performance experiments is
                 difficult. This paper investigates what is necessary to
                 conduct successful computer performance evaluation by
                 attempting to repeat a prior experiment: the comparison
                 between two Linux schedulers. In our efforts, we found
                 that exploring an experimental space through a series
                 of incremental experiments can be inconclusive, and
                 there may be no indication of how much experimentation
                 will be enough. Analysis of variance (ANOVA), a
                 traditional analysis method, is able to partly solve
                 the problems with the previous approach, but we
                 demonstrate that ANOVA can be insufficient for proper
                 analysis due to the requirements it imposes on the
                 data. Finally, we demonstrate the successful
                 application of quantile regression, a recent
                 development in statistics, to computer performance
                 experiments. Quantile regression can provide more
                 insight into the experiment than ANOVA, with the
                 additional benefit of being applicable to data from any
                 distribution. This property makes it especially useful
                 in our field, since non-normally distributed data is
                 common in computer experiments.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{Curtsinger:2013:SSS,
  author =       "Charlie Curtsinger and Emery D. Berger",
  title =        "{STABILIZER}: statistically sound performance
                 evaluation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "1",
  pages =        "219--228",
  month =        mar,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490301.2451141",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:40:49 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Researchers and software developers require effective
                 performance evaluation. Researchers must evaluate
                 optimizations or measure overhead. Software developers
                 use automatic performance regression tests to discover
                 when changes improve or degrade performance. The
                 standard methodology is to compare execution times
                 before and after applying changes. Unfortunately,
                 modern architectural features make this approach
                 unsound. Statistically sound evaluation requires
                 multiple samples to test whether one can or cannot
                 (with high confidence) reject the null hypothesis that
                 results are the same before and after. However, caches
                 and branch predictors make performance dependent on
                 machine-specific parameters and the exact layout of
                 code, stack frames, and heap objects. A single binary
                 constitutes just one sample from the space of program
                 layouts, regardless of the number of runs. Since
                 compiler optimizations and code changes also alter
                 layout, it is currently impossible to distinguish the
                 impact of an optimization from that of its layout
                 effects. This paper presents Stabilizer, a system that
                 enables the use of the powerful statistical techniques
                 required for sound performance evaluation on modern
                 architectures. Stabilizer forces executions to sample
                 the space of memory configurations by repeatedly
                 re-randomizing layouts of code, stack, and heap objects
                 at runtime. Stabilizer thus makes it possible to
                 control for layout effects. Re-randomization also
                 ensures that layout effects follow a Gaussian
                 distribution, enabling the use of statistical tests
                 like ANOVA. We demonstrate Stabilizer's efficiency
                 ({$<$7}\% median overhead) and its effectiveness by
                 evaluating the impact of LLVM's optimizations on the
                 SPEC CPU2006 benchmark suite. We find that, while -O2
                 has a significant impact relative to -O1, the
                 performance impact of -O3 over -O2 optimizations is
                 indistinguishable from random noise.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{Gidra:2013:SSS,
  author =       "Lokesh Gidra and Ga{\"e}l Thomas and Julien Sopena and
                 Marc Shapiro",
  title =        "A study of the scalability of stop-the-world garbage
                 collectors on multicores",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "1",
  pages =        "229--240",
  month =        mar,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490301.2451142",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:40:49 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Large-scale multicore architectures create new
                 challenges for garbage collectors (GCs). In particular,
                 throughput-oriented stop-the-world algorithms
                 demonstrate good performance with a small number of
                 cores, but have been shown to degrade badly beyond
                 approximately 8 cores on a 48-core with OpenJDK 7. This
                 negative result raises the question whether the
                 stop-the-world design has intrinsic limitations that
                 would require a radically different approach. Our study
                 suggests that the answer is no, and that there is no
                 compelling scalability reason to discard the existing
                 highly-optimised throughput-oriented GC code on
                 contemporary hardware. This paper studies the default
                 throughput-oriented garbage collector of OpenJDK 7,
                 called Parallel Scavenge. We identify its bottlenecks,
                 and show how to eliminate them using well-established
                 parallel programming techniques. On the SPECjbb2005,
                 SPECjvm2008 and DaCapo 9.12 benchmarks, the improved GC
                 matches the performance of Parallel Scavenge at low
                 core count, but scales well, up to 48~cores.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{McFarlin:2013:DDO,
  author =       "Daniel S. McFarlin and Charles Tucker and Craig
                 Zilles",
  title =        "Discerning the dominant out-of-order performance
                 advantage: is it speculation or dynamism?",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "1",
  pages =        "241--252",
  month =        mar,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490301.2451143",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:40:49 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In this paper, we set out to study the performance
                 advantages of an Out-of-Order (OOO) processor relative
                 to in-order processors with similar execution
                 resources. In particular, we try to tease apart the
                 performance contributions from two sources: the
                 improved schedules enabled by OOO hardware speculation
                 support and its ability to generate different schedules
                 on different occurrences of the same instructions based
                 on operand and functional unit availability. We find
                 that the ability to express good static schedules
                 achieves the bulk of the speedup resulting from OOO.
                 Specifically, of the 53\% speedup achieved by OOO
                 relative to a similarly provisioned in-order machine,
                 we find that 88\% of that speedup can be achieved by
                 using a single ``best'' static schedule as suggested by
                 observing an OOO schedule of the code. We discuss the
                 ISA mechanisms that would be required to express these
                 static schedules. Furthermore, we find that the
                 benefits of dynamism largely come from two kinds of
                 events that influence the application's critical path:
                 load instructions that miss in the cache only part of
                 the time and branch mispredictions. We find that much
                 of the benefit of OOO dynamism can be achieved by the
                 potentially simpler task of addressing these two
                 behaviors directly.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{Checkoway:2013:IAW,
  author =       "Stephen Checkoway and Hovav Shacham",
  title =        "{Iago} attacks: why the system call {API} is a bad
                 untrusted {RPC} interface",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "1",
  pages =        "253--264",
  month =        mar,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490301.2451145",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:40:49 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In recent years, researchers have proposed systems for
                 running trusted code on an untrusted operating system.
                 Protection mechanisms deployed by such systems keep a
                 malicious kernel from directly manipulating a trusted
                 application's state. Under such systems, the
                 application and kernel are, conceptually, peers, and
                 the system call API defines an RPC interface between
                 them. We introduce Iago attacks, attacks that a
                 malicious kernel can mount in this model. We show how a
                 carefully chosen sequence of integer return values to
                 Linux system calls can lead a supposedly protected
                 process to act against its interests, and even to
                 undertake arbitrary computation at the malicious
                 kernel's behest. Iago attacks are evidence that
                 protecting applications from malicious kernels is more
                 difficult than previously realized.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{Hofmann:2013:ISA,
  author =       "Owen S. Hofmann and Sangman Kim and Alan M. Dunn and
                 Michael Z. Lee and Emmett Witchel",
  title =        "{InkTag}: secure applications on an untrusted
                 operating system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "1",
  pages =        "265--278",
  month =        mar,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490301.2451146",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:40:49 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "InkTag is a virtualization-based architecture that
                 gives strong safety guarantees to high-assurance
                 processes even in the presence of a malicious operating
                 system. InkTag advances the state of the art in
                 untrusted operating systems in both the design of its
                 hypervisor and in the ability to run useful
                 applications without trusting the operating system. We
                 introduce paraverification, a technique that simplifies
                 the InkTag hypervisor by forcing the untrusted
                 operating system to participate in its own
                 verification. Attribute-based access control allows
                 trusted applications to create decentralized access
                 control policies. InkTag is also the first system of
                 its kind to ensure consistency between secure data and
                 metadata, ensuring recoverability in the face of system
                 crashes.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{Giuffrida:2013:SAL,
  author = {Cristiano Giuffrida and
            Anton Kuijsten and
            Andrew S. Tanenbaum},
  title = {Safe and automatic live update for operating systems},
  journal = j-COMP-ARCH-NEWS,
  volume = {41},
  number = {1},
  pages = {279--292},
  month = mar,
  year = {2013},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/2490301.2451147},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Thu Sep 4 07:40:49 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {Increasingly many systems have to run all the time with no
    downtime allowed. Consider, for example, systems controlling electric
    power plants and e-banking servers. Nevertheless, security patches and a
    constant stream of new operating system versions need to be deployed
    without stopping running programs. These factors naturally lead to a
    pressing demand for live update---upgrading all or parts of the
    operating system without rebooting. Unfortunately, existing solutions
    require significant manual intervention and thus work reliably only for
    small operating system patches. In this paper, we describe an automated
    system for live update that can safely and automatically handle major
    upgrades without rebooting. We have implemented our ideas in Proteos, a
    new research OS designed with live update in mind. Proteos relies on
    system support and nonintrusive instrumentation to handle even very
    complex updates with minimal manual effort. The key novelty is the idea
    of state quiescence, which allows updates to happen only in safe and
    predictable system states. A second novelty is the ability to
    automatically perform transactional live updates at the process level,
    ensuring a safe and stable update process. Unlike prior solutions,
    Proteos supports automated state transfer, state checking, and hot
    rollback. We have evaluated Proteos on 50 real updates and on novel live
    update scenarios. The results show that our techniques can effectively
    support both simple and complex updates, while outperforming prior
    solutions in terms of flexibility, security, reliability, and stability
    of the update process.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS '13 conference proceedings.},
}

@Article{Mai:2013:VSI,
  author = {Haohui Mai and
            Edgar Pek and
            Hui Xue and
            Samuel Talmadge King and
            Parthasarathy Madhusudan},
  title = {Verifying security invariants in {ExpressOS}},
  journal = j-COMP-ARCH-NEWS,
  volume = {41},
  number = {1},
  pages = {293--304},
  month = mar,
  year = {2013},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/2490301.2451148},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Thu Sep 4 07:40:49 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {Security for applications running on mobile devices is
    important. In this paper we present ExpressOS, a new OS for enabling
    high-assurance applications to run on commodity mobile devices securely.
    Our main contributions are a new OS architecture and our use of formal
    methods for proving key security invariants about our implementation. In
    our use of formal methods, we focus solely on proving that our OS
    implements our security invariants correctly, rather than striving for
    full functional correctness, requiring significantly less verification
    effort while still proving the security relevant aspects of our system.
    We built ExpressOS, analyzed its security, and tested its performance.
    Our evaluation shows that the performance of ExpressOS is comparable to
    an Android-based system. In one test, we ran the same web browser on
    ExpressOS and on an Android-based system, and found that ExpressOS adds
    16\% overhead on average to the page load latency time for nine popular
    web sites.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS '13 conference proceedings.},
}

@Article{Schkufza:2013:SS,
  author = {Eric Schkufza and
            Rahul Sharma and
            Alex Aiken},
  title = {Stochastic superoptimization},
  journal = j-COMP-ARCH-NEWS,
  volume = {41},
  number = {1},
  pages = {305--316},
  month = mar,
  year = {2013},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/2490301.2451150},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Thu Sep 4 07:40:49 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {We formulate the loop-free binary superoptimization task as a
    stochastic search problem. The competing constraints of transformation
    correctness and performance improvement are encoded as terms in a cost
    function, and a Markov Chain Monte Carlo sampler is used to rapidly
    explore the space of all possible programs to find one that is an
    optimization of a given target program. Although our method sacrifices
    completeness, the scope of programs we are able to consider, and the
    resulting quality of the programs that we produce, far exceed those of
    existing superoptimizers. Beginning from binaries compiled by llvm -O0
    for 64-bit x86, our prototype implementation, STOKE, is able to produce
    programs which either match or outperform the code produced by gcc -O3,
    icc -O3, and in some cases, expert handwritten assembly.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS '13 conference proceedings.},
}

@Article{Schulte:2013:ARB,
  author = {Eric Schulte and
            Jonathan DiLorenzo and
            Westley Weimer and
            Stephanie Forrest},
  title = {Automated repair of binary and assembly programs for cooperating
    embedded devices},
  journal = j-COMP-ARCH-NEWS,
  volume = {41},
  number = {1},
  pages = {317--328},
  month = mar,
  year = {2013},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/2490301.2451151},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Thu Sep 4 07:40:49 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {We present a method for automatically repairing arbitrary
    software defects in embedded systems, which have limited memory, disk
    and CPU capacities, but exist in great numbers. We extend evolutionary
    computation (EC) algorithms that search for valid repairs at the source
    code level to assembly and ELF format binaries, compensating for limited
    system resources with several algorithmic innovations. Our method does
    not require access to the source code or build toolchain of the software
    under repair, does not require program instrumentation, specialized
    execution environments, or virtual machines, or prior knowledge of the
    bug type. We repair defects in ARM and x86 assembly as well as ELF
    binaries, observing decreases of 86\% in memory and 95\% in disk
    requirements, with 62\% decrease in repair time, compared to similar
    source-level techniques. These advances allow repairs previously
    possible only with C source code to be applied to any ARM or x86
    assembly or ELF executable. Efficiency gains are achieved by introducing
    stochastic fault localization, with much lower overhead than comparable
    deterministic methods, and low-level program representations. When
    distributed over multiple devices, our algorithm finds repairs faster
    than predicted by naive parallelism. Four devices using our approach are
    five times more efficient than a single device because of our
    collaboration model. The algorithm is implemented on Nokia N900
    smartphones, with inter-phone communication fitting in 900 bytes sent in
    7 SMS text messages per device per repair on average.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS '13 conference proceedings.},
}

@Article{Cui:2013:VSR,
  author =       "Heming Cui and Gang Hu and Jingyue Wu and Junfeng
                 Yang",
  title =        "Verifying systems rules using rule-directed symbolic
                 execution",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "1",
  pages =        "329--342",
  month =        mar,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490301.2451152",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:40:49 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Systems code must obey many rules, such as ``opened
                 files must be closed.'' One approach to verifying rules
                 is static analysis, but this technique cannot infer
                 precise runtime effects of code, often emitting many
                 false positives. An alternative is symbolic execution,
                 a technique that verifies program paths over all inputs
                 up to a bounded size. However, when applied to verify
                 rules, existing symbolic execution systems often
                 blindly explore many redundant program paths while
                 missing relevant ones that may contain bugs. Our key
                 insight is that only a small portion of paths are
                 relevant to rules, and the rest (majority) of paths are
                 irrelevant and do not need to be verified. Based on
                 this insight, we create WOODPECKER, a new symbolic
                 execution system for effectively checking rules on
                 systems programs. It provides a set of builtin checkers
                 for common rules, and an interface for users to easily
                 check new rules. It directs symbolic execution toward
                 the program paths relevant to a checked rule, and
                 soundly prunes redundant paths, exponentially speeding
                 up symbolic execution. It is designed to be
                 heuristic-agnostic, enabling users to leverage existing
                 powerful search heuristics. Evaluation on 136 systems
                 programs totaling 545K lines of code, including some of
                 the most widely used programs, shows that, with a time
                 limit of typically just one hour for each verification
                 run, WOODPECKER effectively verifies 28.7\% of the
                 program and rule combinations over bounded input,
                 whereas an existing symbolic execution system KLEE
                 verifies only 8.5\%. For the remaining combinations,
                 WOODPECKER verifies 4.6 times as many relevant paths as
                 KLEE. With a longer time limit, WOODPECKER verifies
                 much more paths than KLEE, e.g., 17 times as many with
                 a four-hour limit. WOODPECKER detects 113 rule
                 violations, including 10 serious data loss errors with
                 2 most serious ones already confirmed by the
                 corresponding developers.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{Xiang:2013:HHO,
  author = {Xiaoya Xiang and
            Chen Ding and
            Hao Luo and
            Bin Bao},
  title = {{HOTL}: a higher order theory of locality},
  journal = j-COMP-ARCH-NEWS,
  volume = {41},
  number = {1},
  pages = {343--356},
  month = mar,
  year = {2013},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/2490301.2451153},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Thu Sep 4 07:40:49 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {The locality metrics are many, for example, miss ratio to test
    performance, data footprint to manage cache sharing, and reuse distance
    to analyze and optimize a program. It is unclear how different metrics
    are related, whether one subsumes another, and what combination may
    represent locality completely. This paper first derives a set of
    formulas to convert between five locality metrics and gives the
    condition for correctness. The transformation is analogous to
    differentiation and integration used to convert between higher order
    polynomials. As a result, these metrics can be assigned an order and
    organized into a hierarchy. Using the new theory, the paper then
    develops two techniques: one measures the locality in real time without
    special hardware support, and the other predicts multicore cache
    interference without parallel testing. The paper evaluates them using
    sequential and parallel programs as well as for a parallel mix of
    sequential programs.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS '13 conference proceedings.},
}

@Article{Kang:2013:HPP,
  author = {Hui Kang and
            Jennifer L. Wong},
  title = {To hardware prefetch or not to prefetch?: a virtualized
    environment study and core binding approach},
  journal = j-COMP-ARCH-NEWS,
  volume = {41},
  number = {1},
  pages = {357--368},
  month = mar,
  year = {2013},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/2490301.2451155},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Thu Sep 4 07:40:49 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {Most hardware and software vendors suggest disabling hardware
    prefetching in virtualized environments. They claim that prefetching is
    detrimental to application performance due to inaccurate prediction
    caused by workload diversity and VM interference on shared cache.
    However, no comprehensive or quantitative measurements to support this
    belief have been performed. This paper is the first to systematically
    measure the influence of hardware prefetching in virtualized
    environments. We examine a wide variety of benchmarks on three types of
    chip-multiprocessors (CMPs) to analyze the hardware prefetching
    performance. We conduct extensive experiments by taking into account a
    number of important virtualization factors. We find that hardware
    prefetching has minimal destructive influence under most configurations.
    Only with certain application combinations does prefetching influence
    the overall performance. To leverage these findings and make hardware
    prefetching effective across a diversity of virtualized environments, we
    propose a dynamic prefetching-aware VCPU-core binding approach (PAVCB),
    which includes two phases --- classifying and binding. The workload of
    each VM is classified into different cache sharing constraint categories
    based upon its cache access characteristics, considering both prefetch
    requests and demand requests. Then following heuristic rules, the VCPUs
    of each VM are scheduled onto appropriate cores subject to cache sharing
    constraints. We show that the proposed approach can improve performance
    by 12\% on average over the default scheduler and 46\% over manual
    system administrator bindings across different workload combinations in
    the presence of hardware prefetching.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS '13 conference proceedings.},
}

@Article{Kim:2013:DBC,
  author = {Hwanju Kim and
            Sangwook Kim and
            Jinkyu Jeong and
            Joonwon Lee and
            Seungryoul Maeng},
  title = {Demand-based coordinated scheduling for {SMP VMs}},
  journal = j-COMP-ARCH-NEWS,
  volume = {41},
  number = {1},
  pages = {369--380},
  month = mar,
  year = {2013},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/2490301.2451156},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Thu Sep 4 07:40:49 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {As processor architectures have been enhancing their computing
    capacity by increasing core counts, independent workloads can be
    consolidated on a single node for the sake of high resource efficiency
    in data centers. With the prevalence of virtualization technology, each
    individual workload can be hosted on a virtual machine for strong
    isolation between co-located workloads. Along with this trend, hosted
    applications have increasingly been multithreaded to take advantage of
    improved hardware parallelism. Although the performance of many
    multithreaded applications highly depends on communication (or
    synchronization) latency, existing schemes of virtual machine scheduling
    do not explicitly coordinate virtual CPUs based on their communication
    behaviors. This paper presents a demand-based coordinated scheduling
    scheme for consolidated virtual machines that host multithreaded
    workloads. To this end, we propose communication-driven scheduling that
    controls time-sharing in response to inter-processor interrupts (IPIs)
    between virtual CPUs. On the basis of in-depth analysis on the
    relationship between IPI communications and coordination demands, we
    devise IPI-driven coscheduling and delayed preemption schemes, which
    effectively reduce synchronization latency and unnecessary CPU
    consumption. In addition, we introduce a load-conscious CPU allocation
    policy in order to address load imbalance in heterogeneously
    consolidated environments. The proposed schemes are evaluated with
    respect to various scenarios of mixed workloads using the PARSEC
    multithreaded applications. In the evaluation, our scheme improves the
    overall performance of consolidated workloads, especially
    communication-intensive applications, by reducing inefficient
    synchronization latency.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS '13 conference proceedings.},
}

@Article{Dashti:2013:TMH,
  author =       "Mohammad Dashti and Alexandra Fedorova and Justin
                 Funston and Fabien Gaud and Renaud Lachaize and
                 Baptiste Lepers and Vivien Qu{\'e}ma and Mark Roth",
  title =        "Traffic management: a holistic approach to memory
                 placement on {NUMA} systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "1",
  pages =        "381--394",
  month =        mar,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490301.2451157",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:40:49 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "NUMA systems are characterized by Non-Uniform Memory
                 Access times, where accessing data in a remote node
                 takes longer than a local access. NUMA hardware has
                 been built since the late 80's, and the operating
                 systems designed for it were optimized for access
                 locality. They co-located memory pages with the threads
                 that accessed them, so as to avoid the cost of remote
                 accesses. Contrary to older systems, modern NUMA
                 hardware has much smaller remote wire delays, and so
                 remote access costs per se are not the main concern for
                 performance, as we discovered in this work. Instead,
                 congestion on memory controllers and interconnects,
                 caused by memory traffic from data-intensive
                 applications, hurts performance a lot more. Because of
                 that, memory placement algorithms must be redesigned to
                 target traffic congestion. This requires an arsenal of
                 techniques that go beyond optimizing locality. In this
                 paper we describe Carrefour, an algorithm that
                 addresses this goal. We implemented Carrefour in Linux
                 and obtained performance improvements of up to 3.6$
                 \times $ relative to the default kernel, as well as
                 significant improvements compared to NUMA-aware
                 patchsets available for Linux. Carrefour never hurts
                 performance by more than 4\% when memory placement
                 cannot be improved. We present the design of Carrefour,
                 the challenges of implementing it on modern hardware,
                 and draw insights about hardware support that would
                 help optimize system software on future NUMA systems.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{Jog:2013:OCT,
  author = {Adwait Jog and
            Onur Kayiran and
            Nachiappan Chidambaram Nachiappan and
            Asit K. Mishra and
            Mahmut T. Kandemir and
            Onur Mutlu and
            Ravishankar Iyer and
            Chita R. Das},
  title = {{OWL}: cooperative thread array aware scheduling techniques for
    improving {GPGPU} performance},
  journal = j-COMP-ARCH-NEWS,
  volume = {41},
  number = {1},
  pages = {395--406},
  month = mar,
  year = {2013},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/2490301.2451158},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Thu Sep 4 07:40:49 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {Emerging GPGPU architectures, along with programming models
    like CUDA and OpenCL, offer a cost-effective platform for many
    applications by providing high thread level parallelism at lower energy
    budgets. Unfortunately, for many general-purpose applications, available
    hardware resources of a GPGPU are not efficiently utilized, leading to
    lost opportunity in improving performance. A major cause of this is the
    inefficiency of current warp scheduling policies in tolerating long
    memory latencies. In this paper, we identify that the scheduling
    decisions made by such policies are agnostic to thread-block, or
    cooperative thread array (CTA), behavior, and as a result inefficient.
    We present a coordinated CTA-aware scheduling policy that utilizes four
    schemes to minimize the impact of long memory latencies. The first two
    schemes, CTA-aware two-level warp scheduling and locality aware warp
    scheduling, enhance per-core performance by effectively reducing cache
    contention and improving latency hiding capability. The third scheme,
    bank-level parallelism aware warp scheduling, improves overall GPGPU
    performance by enhancing DRAM bank-level parallelism. The fourth scheme
    employs opportunistic memory-side prefetching to further enhance
    performance by taking advantage of open DRAM rows. Evaluations on a
    28-core GPGPU platform with highly memory-intensive applications
    indicate that our proposed mechanism can provide 33\% average
    performance improvement compared to the commonly-employed round-robin
    warp scheduling policy.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS '13 conference proceedings.},
}

@Article{Pai:2013:IGC,
  author = {Sreepathi Pai and
            Matthew J. Thazhuthaveetil and
            R. Govindarajan},
  title = {Improving {GPGPU} concurrency with elastic kernels},
  journal = j-COMP-ARCH-NEWS,
  volume = {41},
  number = {1},
  pages = {407--418},
  month = mar,
  year = {2013},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/2490301.2451160},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Thu Sep 4 07:40:49 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {Each new generation of GPUs vastly increases the resources
    available to GPGPU programs. GPU programming models (like CUDA) were
    designed to scale to use these resources. However, we find that CUDA
    programs actually do not scale to utilize all available resources, with
    over 30\% of resources going unused on average for programs of the
    Parboil2 suite that we used in our work. Current GPUs therefore allow
    concurrent execution of kernels to improve utilization. In this work, we
    study concurrent execution of GPU kernels using multiprogram workloads
    on current NVIDIA Fermi GPUs. On two-program workloads from the Parboil2
    benchmark suite we find concurrent execution is often no better than
    serialized execution. We identify that the lack of control over resource
    allocation to kernels is a major serialization bottleneck. We propose
    transformations that convert CUDA kernels into elastic kernels which
    permit fine-grained control over their resource usage. We then propose
    several elastic-kernel aware concurrency policies that offer
    significantly better performance and concurrency compared to the current
    CUDA policy. We evaluate our proposals on real hardware using
    multiprogrammed workloads constructed from benchmarks in the Parboil 2
    suite. On average, our proposals increase system throughput (STP) by
    1.21x and improve the average normalized turnaround time (ANTT) by 3.73x
    for two-program workloads when compared to the current CUDA concurrency
    implementation.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS '13 conference proceedings.},
}

@Article{Oh:2013:PAL,
  author = {Taewook Oh and
            Hanjun Kim and
            Nick P. Johnson and
            Jae W. Lee and
            David I. August},
  title = {Practical automatic loop specialization},
  journal = j-COMP-ARCH-NEWS,
  volume = {41},
  number = {1},
  pages = {419--430},
  month = mar,
  year = {2013},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/2490301.2451161},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Thu Sep 4 07:40:49 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {Program specialization optimizes a program with respect to
    program invariants, including known, fixed inputs. These invariants can
    be used to enable optimizations that are otherwise unsound. In many
    applications, a program input induces predictable patterns of values
    across loop iterations, yet existing specializers cannot fully
    capitalize on this opportunity. To address this limitation, we present
    Invariant-induced Pattern based Loop Specialization (IPLS), the first
    fully-automatic specialization technique designed for everyday use on
    real applications. Using dynamic information-flow tracking, IPLS
    profiles the values of instructions that depend solely on invariants and
    recognizes repeating patterns across multiple iterations of hot loops.
    IPLS then specializes these loops, using those patterns to predict
    values across a large window of loop iterations. This enables aggressive
    optimization of the loop; conceptually, this optimization reconstructs
    recurring patterns induced by the input as concrete loops in the
    specialized binary. IPLS specializes real-world programs that prior
    techniques fail to specialize without requiring hints from the user.
    Experiments demonstrate a geomean speedup of 14.1\% with a maximum
    speedup of 138\% over the original codes when evaluated on three script
    interpreters and eleven scripts each.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS '13 conference proceedings.},
}

@Article{Phothilimthana:2013:PPH,
  author =       "Phitchaya Mangpo Phothilimthana and Jason Ansel and
                 Jonathan Ragan-Kelley and Saman Amarasinghe",
  title =        "Portable performance on heterogeneous architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "1",
  pages =        "431--444",
  month =        mar,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490301.2451162",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:40:49 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Trends in both consumer and high performance computing
                 are bringing not only more cores, but also increased
                 heterogeneity among the computational resources within
                 a single machine. In many machines, one of the greatest
                 computational resources is now their graphics
                 coprocessors (GPUs), not just their primary CPUs. But
                 GPU programming and memory models differ dramatically
                 from conventional CPUs, and the relative performance
                 characteristics of the different processors vary widely
                 between machines. Different processors within a system
                 often perform best with different algorithms and memory
                 usage patterns, and achieving the best overall
                 performance may require mapping portions of programs
                 across all types of resources in the machine. To
                 address the problem of efficiently programming machines
                 with increasingly heterogeneous computational
                 resources, we propose a programming model in which the
                 best mapping of programs to processors and memories is
                 determined empirically. Programs define choices in how
                 their individual algorithms may work, and the compiler
                 generates further choices in how they can map to CPU
                 and GPU processors and memory systems. These choices
                 are given to an empirical autotuning framework that
                 allows the space of possible implementations to be
                 searched at installation time. The rich choice space
                 allows the autotuner to construct poly-algorithms that
                 combine many different algorithmic techniques, using
                 both the CPU and the GPU, to obtain better performance
                 than any one technique alone. Experimental results show
                 that algorithmic changes, and the varied use of both
                 CPUs and GPUs, are necessary to obtain up to a 16.5x
                 speedup over using a single program configuration for
                 all architectures.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{Mittal:2013:EVE,
  author =       "Aashish Mittal and Dushyant Bansal and Sorav Bansal
                 and Varun Sethi",
  title =        "Efficient virtualization on embedded {Power
                 Architecture\reg} platforms",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "1",
  pages =        "445--458",
  month =        mar,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490301.2451163",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:40:49 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Power Architecture\reg{} processors are popular and
                 widespread on embedded systems, and such platforms are
                 increasingly being used to run virtual machines. While
                 the Power Architecture meets the Popek-and-Goldberg
                 virtualization requirements for traditional
                 trap-and-emulate style virtualization, the performance
                 overhead of virtualization remains high. For example,
                 workloads exhibiting a large amount of kernel activity
                 typically show 3-5x slowdowns over bare-metal. Recent
                 additions to the Linux kernel contain guest and host
                 side paravirtual extensions for Power Architecture
                 platforms. While these extensions improve performance
                 significantly, they are guest-specific,
                 guest-intrusive, and cover only a subset of all
                 possible virtualization optimizations. We present a set
                 of host-side optimizations that achieve comparable
                 performance to the aforementioned paravirtual
                 extensions, on an unmodified guest. Our optimizations
                 are based on adaptive in-place binary translation.
                 Unlike the paravirtual approach, our solution is guest
                 neutral. We implement our ideas in a prototype based on
                 Qemu/KVM. After our modifications, KVM can boot an
                 unmodified Linux guest around 2.5x faster. We contrast
                 our optimization approach with previous similar binary
                 translation based approaches for the x86 architecture;
                 in our experience, each architecture presents a unique
                 set of challenges and optimization opportunities.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{Hill:2013:RDC,
  author =       "Mark D. Hill",
  title =        "Research directions for 21st century computer systems:
                 {ASPLOS 2013} panel",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "1",
  pages =        "459--460",
  month =        mar,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490301.2451165",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:40:49 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Four recent efforts call out architectural challenges
                 and opportunities up and down the software/hardware
                 stack. This panel will discuss, ``What should the
                 community do to facilitate, transcend, or refute these
                 partially overlapping visions?'' The panel is chaired
                 by Mark D. Hill with other panel members not finalized
                 for the ASPLOS'13 proceedings.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{Madhavapeddy:2013:ULO,
  author =       "Anil Madhavapeddy and Richard Mortier and Charalampos
                 Rotsos and David Scott and Balraj Singh and Thomas
                 Gazagnaire and Steven Smith and Steven Hand and Jon
                 Crowcroft",
  title =        "Unikernels: library operating systems for the cloud",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "1",
  pages =        "461--472",
  month =        mar,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490301.2451167",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:40:49 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "We present unikernels, a new approach to deploying
                 cloud services via applications written in high-level
                 source code. Unikernels are single-purpose appliances
                 that are compile-time specialised into standalone
                 kernels, and sealed against modification when deployed
                 to a cloud platform. In return they offer significant
                 reduction in image sizes, improved efficiency and
                 security, and should reduce operational costs. Our
                 Mirage prototype compiles OCaml code into unikernels
                 that run on commodity clouds and offer an order of
                 magnitude reduction in code size without significant
                 performance penalty. The architecture combines static
                 type-safety with a single address-space layout that can
                 be made immutable via a hypervisor extension. Mirage
                 contributes a suite of type-safe protocol libraries,
                 and our results demonstrate that the hypervisor is a
                 platform that overcomes the hardware compatibility
                 issues that have made past library operating systems
                 impractical to deploy in the real-world.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{Kadav:2013:FGF,
  author =       "Asim Kadav and Matthew J. Renzelmann and Michael M.
                 Swift",
  title =        "Fine-grained fault tolerance using device
                 checkpoints",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "1",
  pages =        "473--484",
  month =        mar,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490301.2451168",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:40:49 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Recovering faults in drivers is difficult compared to
                 other code because their state is spread across both
                 memory and a device. Existing driver fault-tolerance
                 mechanisms either restart the driver and discard its
                 state, which can break applications, or require an
                 extensive logging mechanism to replay requests and
                 recreate driver state. Even logging may be
                 insufficient, though, if the semantics of requests are
                 ambiguous. In addition, these systems either require
                 large subsystems that must be kept up-to-date as the
                 kernel changes, or require substantial rewriting of
                 drivers. We present a new driver fault-tolerance
                 mechanism that provides fine-grained control over the
                 code protected. Fine-Grained Fault Tolerance (FGFT)
                 isolates driver code at the granularity of a single
                 entry point. It executes driver code as a transaction,
                 allowing roll back if the driver fails. We develop a
                 novel checkpointing mechanism to save and restore
                 device state using existing power management code.
                 Unlike past systems, FGFT can be incrementally deployed
                 in a single driver without the need for a large kernel
                 subsystem, but at the cost of small modifications to
                 the driver. In the evaluation, we show that FGFT can
                 have almost zero runtime cost in many cases, and that
                 checkpoint-based recovery can reduce the duration of a
                 failure by 79\% compared to restarting the driver.
                 Finally, we show that applying FGFT to a driver
                 requires little effort, and the majority of drivers in
                 common classes already contain the power-management
                 code needed for checkpoint/restore.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{Silberstein:2013:GIF,
  author =       "Mark Silberstein and Bryan Ford and Idit Keidar and
                 Emmett Witchel",
  title =        "{GPUfs}: integrating a file system with {GPUs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "1",
  pages =        "485--498",
  month =        mar,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490301.2451169",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:40:49 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "GPU hardware is becoming increasingly general purpose,
                 quickly outgrowing the traditional but constrained
                 GPU-as-coprocessor programming model. To make GPUs
                 easier to program and easier to integrate with existing
                 systems, we propose making the host's file system
                 directly accessible from GPU code. GPUfs provides a
                 POSIX-like API for GPU programs, exploits GPU
                 parallelism for efficiency, and optimizes GPU file
                 access by extending the buffer cache into GPU memory.
                 Our experiments, based on a set of real benchmarks
                 adopted to use our file system, demonstrate the
                 feasibility and benefits of our approach. For example,
                 we demonstrate a simple self-contained GPU program
                 which searches for a set of strings in the entire tree
                 of Linux kernel source files over seven times faster
                 than an eight-core CPU run.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{Hunt:2013:DTN,
  author =       "Nicholas Hunt and Tom Bergan and Luis Ceze and Steven
                 D. Gribble",
  title =        "{DDOS}: taming nondeterminism in distributed systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "1",
  pages =        "499--508",
  month =        mar,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490301.2451170",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:40:49 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Nondeterminism complicates the development and
                 management of distributed systems, and arises from two
                 main sources: the local behavior of each individual
                 node as well as the behavior of the network connecting
                 them. Taming nondeterminism effectively requires
                 dealing with both sources. This paper proposes DDOS, a
                 system that leverages prior work on deterministic
                 multithreading to offer: (1) space-efficient
                 record/replay of distributed systems; and (2) fully
                 deterministic distributed behavior. Leveraging
                 deterministic behavior at each node makes outgoing
                 messages strictly a function of explicit inputs. This
                 allows us to record the system by logging just
                 message's arrival time, not the contents. Going
                 further, we propose and implement an algorithm that
                 makes all communication between nodes deterministic by
                 scheduling communication onto a global logical
                 timeline. We implement both algorithms in a system
                 called DDOS and evaluate our system with parallel
                 scientific applications, an HTTP/memcached system and a
                 distributed microbenchmark with a high volume of
                 peer-to-peer communication. Our results show up to two
                 orders of magnitude reduction in log size of
                 record/replay, and that distributed systems can be made
                 deterministic with an order of magnitude of overhead.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{Wang:2013:TEH,
  author =       "Cheng Wang and Youfeng Wu",
  title =        "{TSO\_ATOMICITY}: efficient hardware primitive for
                 {TSO}-preserving region optimizations",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "1",
  pages =        "509--520",
  month =        mar,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490301.2451172",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:40:49 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Program optimizations based on data dependences may
                 not preserve the memory consistency in the programs.
                 Previous works leverage a hardware ATOMICITY primitive
                 to restrict the thread interleaving for preserving
                 sequential consistency in region optimizations.
                 However, ATOMICITY primitive is over restrictive on the
                 thread interleaving for optimizing real-world
                 applications developed with the popular
                 Total-Store-Ordering (TSO) memory consistency, which is
                 weaker than sequential consistency. In this paper, we
                 present a novel hardware TSO\_ATOMICITY primitive,
                 which has less restriction on the thread interleaving
                 than ATOMICITY primitive to permit more efficient
                 program execution than ATOMICITY primitive, but can
                 still preserve TSO memory consistency in all region
                 optimizations. Furthermore, TSO\_ATOMICITY primitive
                 requires similar architecture support as ATOMICITY
                 primitive and can be implemented with only slight
                 change to the existing ATOMICITY primitive
                 implementation. Our experimental results show that in a
                 start-of-art dynamic binary optimization system on a
                 large set of workloads, ATOMICITY primitive can only
                 improve the performance by 4\% on average.
                 TSO\_ATOMICITY primitive can reduce the overhead
                 associated with ATOMICITY primitive and improve the
                 performance by 12\% on average.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{Jafri:2013:WGI,
  author =       "Syed Ali Raza Jafri and Gwendolyn Voskuilen and T. N.
                 Vijaykumar",
  title =        "{Wait-n-GoTM}: improving {HTM} performance by
                 serializing cyclic dependencies",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "1",
  pages =        "521--534",
  month =        mar,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490301.2451173",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:40:49 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Transactional memory (TM) has been proposed to
                 alleviate some key programmability problems in chip
                 multiprocessors. Most TMs optimistically allow
                 concurrent transactions, detecting read-write or
                 write-write conflicts. Upon conflicts, existing
                 hardware TMs (HTMs) use one of three
                 conflict-resolution policies: (1) always-abort, (2)
                 always-wait for some conflicting transactions to
                 complete, or (3) always-go past conflicts and resolve
                 acyclic conflicts at commit or abort upon cyclic
                 dependencies. While each policy has advantages, the
                 policies degrade performance under contention by
                 limiting concurrency (always-abort, always-wait) or
                 incurring late aborts due to cyclic dependencies
                 (always-go). Thus, while always-go avoids acyclic
                 aborts, no policy avoids cyclic aborts. We propose
                 Wait-n-GoTM (WnGTM) to increase concurrency while
                 avoiding cyclic aborts. We observe that most cyclic
                 dependencies are caused by threads interleaving
                 multiple accesses to a few heavily-read-write-shared
                 delinquent data cache blocks. These accesses occur in
                 code sections called cycle inducer sections (CISTs).
                 Accordingly, we propose Wait-n-Go (WnG)
                 conflict-resolution to avoid many cyclic aborts by
                 predicting and serializing the CISTs. To support the
                 WnG policy, we extend previous HTMs to (1) allow
                 multiple readers and writers, (2) scalably identify
                 dependencies, and (3) detect cyclic dependencies via
                 new mechanisms, namely, conflict transactional state,
                 order-capture, and hardware timestamps, respectively.
                 In 16-core simulations of STAMP, WnGTM achieves average
                 speedups of 46\% for higher-contention benchmarks and
                 28\% for all benchmarks over always-abort (TokenTM)
                 with low-contention benchmarks remaining unchanged,
                 compared to always-go (DATM) and always-wait
                 (LogTM-SE), which perform worse than and 6\% better
                 than TokenTM, respectively.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{Qian:2013:VSP,
  author =       "Xuehai Qian and Josep Torrellas and Benjamin Sahelices
                 and Depei Qian",
  title =        "Volition: scalable and precise sequential consistency
                 violation detection",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "1",
  pages =        "535--548",
  month =        mar,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490301.2451174",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:40:49 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Sequential Consistency (SC) is the most intuitive
                 memory model, and SC Violations (SCVs) produce
                 unintuitive, typically incorrect executions. Most prior
                 SCV detection schemes have used data races as proxies
                 for SCVs, which is highly imprecise. Other schemes that
                 have targeted data-race cycles are either too
                 conservative or are designed only for two-processor
                 cycles and snoopy-based systems. This paper presents
                 Volition, the first hardware scheme that detects SCVs
                 in a relaxed-consistency machine precisely, in a
                 scalable manner, and for an arbitrary number of
                 processors in the cycle. Volition leverages cache
                 coherence protocol transactions to dynamically detect
                 cycles in memory-access orders across threads. When a
                 cycle is about to occur, an exception is triggered.
                 Volition can be used in both directory- and
                 snoopy-based coherence protocols. Our simulations of
                 Volition in a 64-processor multicore with
                 directory-based coherence running SPLASH-2 and Parsec
                 programs shows that Volition induces negligible traffic
                 and execution overhead. In addition, it can detect SCVs
                 with several processors. Volition is suitable for
                 on-the-fly use.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{Grossman:2013:HSF,
  author =       "J. P. Grossman and Jeffrey S. Kuskin and Joseph A.
                 Bank and Michael Theobald and Ron O. Dror and Douglas
                 J. Ierardi and Richard H. Larson and U. Ben Schafer and
                 Brian Towles and Cliff Young and David E. Shaw",
  title =        "Hardware support for fine-grained event-driven
                 computation in {Anton 2}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "1",
  pages =        "549--560",
  month =        mar,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490301.2451175",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:40:49 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Exploiting parallelism to accelerate a computation
                 typically involves dividing it into many small tasks
                 that can be assigned to different processing elements.
                 An efficient execution schedule for these tasks can be
                 difficult or impossible to determine in advance,
                 however, if there is uncertainty as to when each task's
                 input data will be available. Ideally, each task would
                 run in direct response to the arrival of its input
                 data, thus allowing the computation to proceed in a
                 fine-grained event-driven manner. Realizing this ideal
                 is difficult in practice, and typically requires
                 sacrificing flexibility for performance. In Anton 2, a
                 massively parallel special-purpose supercomputer for
                 molecular dynamics simulations, we addressed this
                 challenge by including a hardware block, called the
                 dispatch unit, that provides flexible and efficient
                 support for fine-grained event-driven computation. Its
                 novel features include a many-to-many mapping from
                 input data to a set of synchronization counters, and
                 the ability to prioritize tasks based on their type. To
                 solve the additional problem of using a fixed set of
                 synchronization counters to track input data for a
                 potentially large number of tasks, we created a
                 software library that allows programmers to treat Anton
                 2 as an idealized machine with infinitely many
                 synchronization counters. The dispatch unit, together
                 with this library, made it possible to simplify our
                 molecular dynamics software by expressing it as a
                 collection of independent tasks, and the resulting
                 fine-grained execution schedule improved overall
                 performance by up to 16\% relative to a coarse-grained
                 schedule for precisely the same computation.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '13 conference proceedings.",
}

@Article{Sinha:2013:NRA,
  author =       "Amitabha Sinha and Mitrava Sarkar and Soumojit
                 Acharyya and Suranjan Chakraborty",
  title =        "A novel reconfigurable architecture of a {DSP}
                 processor for efficient mapping of {DSP} functions
                 using field programmable {DSP} arrays",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "2",
  pages =        "1--8",
  month =        may,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2490302.2490304",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Sat Jun 1 11:00:26 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Development of modern integrated circuit technologies
                 makes it feasible to develop cheaper, faster and
                 smaller special purpose signal processing function
                 circuits. Digital Signal processing functions are
                 generally implemented either on ASICs with
                 inflexibility, or on FPGAs with bottlenecks of
                 relatively smaller utilization factor or lower speed
                 compared to ASIC. Field Programmable DSP Array (FPDA)
                 is the proposed DSP dedicated device, redolent to FPGA,
                 but with basic fixed common modules (CMs) (like adders,
                 subtractors, multipliers, scaling units, shifters)
                 instead of CLBs. This paper introduces the development
                 of reconfigurable system architecture with a focus on
                 FPDA that integrates different DSP functions like DFT,
                 FFT, DCT, FIR, IIR, and DWT etc. The switching between
                 DSP functions is occurred by reconfiguring the
                 interconnection between CMs. Validation of the proposed
                 architecture has been achieved on Virtex5 FPGA. The
                 architecture provides sufficient amount of flexibility,
                 parallelism and scalability.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@article{Saha:2013:PAF,
  author = {Saha, Amrita and Mukherjee, Manideepa and Datta, Debanjana and
    Saha, Sangita and Sinha, Amitabha},
  title = {Performance analysis of a {FPGA} based novel binary and {DBNS}
    multiplier},
  journal = j-COMP-ARCH-NEWS,
  fjournal = {ACM SIGARCH Computer Architecture News},
  year = {2013},
  month = may,
  volume = {41},
  number = {2},
  pages = {9--16},
  doi = {https://doi.org/10.1145/2490302.2490305},
  coden = {CANED2},
  issn = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l = {0163-5964},
  journal-url = {https://dl.acm.org/loi/sigarch},
  bibdate = {Sat Jun 1 11:00:26 MDT 2013},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  abstract = {Designing high performance Software Defined Radio
    (SDR) with low power and flexibility is a major
    challenge. While the high performance DSP processors
    are unable to meet the speed requirements of these
    SDRs, System on chips (SOCs) are also not suitable
    because of their limited flexibility. Recently
    dynamically reconfigurable FPGAs have emerged as high
    performance programmable hardware to execute highly
    parallel, computationally intensive signal processing
    functions efficiently. Since basic intention of an SDR
    is to implement different modulation / demodulation
    schemes and basic building blocks for such schemes are
    signal processing functions, FPGAs have become an
    important component for implementing these. However,
    the effectiveness of such an approach with respect to
    cost, performance and flexibility need to be examined.
    Double Base Number Systems (DBNS) have been gaining
    attention for compute intensive applications in signal
    processing because of their higher performance in
    arithmetic operations in general and particularly
    multiplication. Keeping these issues in view, this
    paper aims to present a new Software defined Radio. To
    Enhance the performance of the proposed architecture ,
    analysis have been done employing both single index and
    multiple indices DBNS multipliers. Experiments and
    analysis on performance have also been done with its
    binary counterpart. Both DBNS and binary based
    architecture were implemented on Xilinx virtex iv FPGA
    using xilinx ISE 9.1 i.},
}

@article{Sartin-Tarm:2013:CCS,
  author = {Sartin-Tarm, Michael and Nowatzki, Tony and {De Carli}, Lorenzo
    and Sankaralingam, Karthikeyan and Estan, Cristian},
  title = {Constraint centric scheduling guide},
  journal = j-COMP-ARCH-NEWS,
  fjournal = {ACM SIGARCH Computer Architecture News},
  year = {2013},
  month = may,
  volume = {41},
  number = {2},
  pages = {17--21},
  doi = {https://doi.org/10.1145/2490302.2490306},
  coden = {CANED2},
  issn = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l = {0163-5964},
  journal-url = {https://dl.acm.org/loi/sigarch},
  bibdate = {Sat Jun 1 11:00:26 MDT 2013},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  abstract = {The advent of architectures with software-exposed
    resources (Spatial Architectures) has created a demand
    for universally applicable scheduling techniques. This
    paper describes our generalized spatial scheduling
    framework, formulated with Integer Linear Programming,
    and specifically accomplishes two goals. First, using
    the ``Simple'' architecture, it illustrates how to use
    our open-source tool to create a customized scheduler
    and covers problem formulation with ILP and GAMS.
    Second, it summarizes results on the application to
    three real architectures (TRIPS,DySER,PLUG),
    demonstrating the technique's practicality and
    competitiveness with existing schedulers.},
}

@article{Guha:2013:SEW,
  author = {Guha, Apala and Zhang, Yao and ur Rasool, Raihan and
    Chien, Andrew A.},
  title = {Systematic evaluation of workload clustering for extremely
    energy-efficient architectures},
  journal = j-COMP-ARCH-NEWS,
  fjournal = {ACM SIGARCH Computer Architecture News},
  year = {2013},
  month = may,
  volume = {41},
  number = {2},
  pages = {22--29},
  doi = {https://doi.org/10.1145/2490302.2490307},
  coden = {CANED2},
  issn = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l = {0163-5964},
  journal-url = {https://dl.acm.org/loi/sigarch},
  bibdate = {Sat Jun 1 11:00:26 MDT 2013},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  abstract = {Chip power consumption has reached its limits, leading
    to the flattening of single-core performance. We
    propose the $10 \times 10$ processor, a federated
    heterogeneous
    multi-core architecture, where each core is an ensemble
    of u-engines (micro-engines, similar to accelerators)
    specialized for different workload groups to achieve
    dramatically higher energy efficiency. The u-engines
    collectively target the entire general-purpose workload
    space. The problem we study in this article is
    selecting the set of workloads that each u-engine
    should be customized for. For this problem we study the
    computation structure of a wide variety of workloads
    and cluster together workloads with similar computation
    structures, the idea being that each u-engine will be
    customized for the compute structures exhibited by a
    particular cluster. The constraint on this problem is
    the silicon budget of a processor. Lower silicon
    budgets accommodate fewer u-engines and require
    individual u-engines to target larger segments of the
    workload space which leads to lower energy efficiency
    benefits from customization, because there is more
    variation among the compute structures making up each
    cluster. Therefore, we also study how workload coverage
    and benefit can be maximized for a given silicon
    budget. We study a broad general-purpose workload that
    includes 34 codes from 6 benchmark suites, identifying
    the most frequent functions, and clustering them based
    on two sets of instruction usage features
    (high-resolution and low-resolution) into 8, 16, 32,
    64, 128 clusters respectively. We develop abstract
    metrics (coverage and weighted customization benefit)
    to evaluate the clusters. We show significant potential
    payoffs with four benefit models: 2-3x (square root
    model), 4-10x (linear model), 12-24x (quadratic model),
    and 22-26x (cubic model).},
}

@article{Saha:2013:IDP,
  author = {Saha, Amrita and Biswas, Pijush and Sinha, Amitabha},
  title = {An integrated development platform of a reconfigurable radio
    processor for software defined radio},
  journal = j-COMP-ARCH-NEWS,
  fjournal = {ACM SIGARCH Computer Architecture News},
  year = {2013},
  month = may,
  volume = {41},
  number = {2},
  pages = {30--35},
  doi = {https://doi.org/10.1145/2490302.2490308},
  coden = {CANED2},
  issn = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l = {0163-5964},
  journal-url = {https://dl.acm.org/loi/sigarch},
  bibdate = {Sat Jun 1 11:00:26 MDT 2013},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  abstract = {Performance required by ``Software Defined Radio
    (SDR)'' poses many challenges in real-time applications
    because of their high computational complexity.
    Designing a high performance SDR with a high degree of
    flexibility becomes an issue of importance. While the
    fastest programmable DSP processors are unable to meet
    the speed requirements for SDR, FPGAs also cannot offer
    the highest possible performance at the lowest silicon
    cost for a given signal processing function. Moreover,
    they are not optimized for radio applications because
    of their LUT based approach. To overcome the
    limitations of both DSP Processor and FPGAs, Radio
    Processor, a reconfigurable Processor optimized for
    Radio applications was conceived.[14],[17]. However,
    advantages of this Radio Processor cannot be made
    useful unless there is an integrated development
    environment to develop SDR. This paper addresses these
    issues by introducing a new Integrated Development
    platform for reconfigurable ``Radio Processor'' for
    implementing SDR.},
}

@article{Pal:2013:FIN,
  author = {Pal, Santanu and Sinha, Amitabha and Biswas, Pijush},
  title = {{FPGA} implementation of a novel {DCT} architecture reducing
    constant cosine terms},
  journal = j-COMP-ARCH-NEWS,
  fjournal = {ACM SIGARCH Computer Architecture News},
  year = {2013},
  month = may,
  volume = {41},
  number = {2},
  pages = {36--40},
  doi = {https://doi.org/10.1145/2490302.2490309},
  coden = {CANED2},
  issn = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l = {0163-5964},
  journal-url = {https://dl.acm.org/loi/sigarch},
  bibdate = {Sat Jun 1 11:00:26 MDT 2013},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  abstract = {This paper presents a new scalable architecture for
    Discrete Cosine Transform (DCT). In contrast to the
    conventional DCT architecture, the proposed
    architecture reduces the number of constant cosine
    terms using the matrix transposition and symmetry
    property. This in turn, considerably reduces the
    computation time. The architecture is scalable and it
    can be extended to support any transform length. The
    architecture was validated on Xilinx Vertex-4 FPGA.},
}

@article{Tseng:2013:NNE,
  author = {Tseng, Kuo-Kun and Zeng, Fu-Fu and Huang, Huang-Nan and
    Liu, Yiming and Pan, Jeng-Shyang and Ip, W. H. and Wu, C. H.},
  title = {A new non-exact {Aho--Corasick} framework for {ECG}
    classification},
  journal = j-COMP-ARCH-NEWS,
  fjournal = {ACM SIGARCH Computer Architecture News},
  year = {2013},
  month = may,
  volume = {41},
  number = {2},
  pages = {41--46},
  doi = {https://doi.org/10.1145/2490302.2490310},
  coden = {CANED2},
  issn = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l = {0163-5964},
  journal-url = {https://dl.acm.org/loi/sigarch},
  bibdate = {Sat Jun 1 11:00:26 MDT 2013},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  abstract = {The Aho--Corasick (AC) algorithm is a popular and
    useful exact string matching algorithm for text
    searching and deep packet inspection. However, it has
    seldom been used for non-exact classification or
    identification. We propose a novel framework to make
    use of AC for non-exact matching in the ECG
    identification. The AC classification (ACC) algorithm
    converts ECG waveforms into several short patterns for
    AC, and decides the identification result by AC matched
    counting value. In our experiments, the results are
    surprisingly good and superior to previous algorithms.
    So, we designed an AC algorithm application for
    non-exact classification with high accuracy. Meanwhile,
    ACC inherits the advantage from AC of being capable of
    handling a large pattern set with linear time
    complexity.},
}

@article{Maitra:2013:HPM,
  author = {Maitra, Subhashis and Sinha, Amitabha},
  title = {High performance {MAC} unit for {DSP} and cryptographic
    applications},
  journal = j-COMP-ARCH-NEWS,
  fjournal = {ACM SIGARCH Computer Architecture News},
  year = {2013},
  month = may,
  volume = {41},
  number = {2},
  pages = {47--55},
  doi = {https://doi.org/10.1145/2490302.2490311},
  coden = {CANED2},
  issn = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l = {0163-5964},
  journal-url = {https://dl.acm.org/loi/sigarch},
  bibdate = {Sat Jun 1 11:00:26 MDT 2013},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  abstract = {Multiplication and addition are the basic arithmetic
    operation used in Digital Signal Processing (DSP) for
    coefficient multiplication, scalar point multiplication
    in Elliptic Curve Cryptography (ECC) and in other
    fields. Multiplications are basically a shift and add
    operation. However, there are many different variations
    on how to do it. Some are more suitable to implement on
    FPGA than others. However time complexities and
    hardware complexities are the major issues in designing
    a multiplier unit. There are different multiplication
    algorithms in current technology. Hardware complexities
    in some design are more than time complexities whereas
    in some other design time complexities are more.
    However there must be a tradeoff between these two
    types of methodology. This paper will discuss a brief
    idea how a tradeoff can be achieved. Experimental
    results that have discussed here and the architecture
    based on the proposed algorithm shows it's novelty.
    Applications of the proposed algorithm on DSP and ECC
    have been dealt here clearly.},
}

@article{Thorson:2013:INa,
  author = {Thorson, Mark},
  title = {{Internet} nuggets},
  journal = j-COMP-ARCH-NEWS,
  fjournal = {ACM SIGARCH Computer Architecture News},
  year = {2013},
  month = may,
  volume = {41},
  number = {2},
  pages = {56--71},
  doi = {https://doi.org/10.1145/2490302.2490313},
  coden = {CANED2},
  issn = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l = {0163-5964},
  journal-url = {https://dl.acm.org/loi/sigarch},
  bibdate = {Sat Jun 1 11:00:26 MDT 2013},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Belhadj:2013:CRW,
  author =       "Bilel Belhadj and Antoine Joubert and Zheng Li and
                 Rodolphe H{\'e}liot and Olivier Temam",
  title =        "Continuous real-world inputs can open up alternative
                 accelerator designs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "1--12",
  month =        jun,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2508148.2485923",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Motivated by energy constraints, future heterogeneous
                 multi-cores may contain a variety of accelerators, each
                 targeting a subset of the application spectrum. Beyond
                 energy, the growing number of faults steers accelerator
                 research towards fault-tolerant accelerators. In this
                 article, we investigate a fault-tolerant and
                 energy-efficient accelerator for signal processing
                 applications. We depart from traditional designs by
                 introducing an accelerator which relies on unary
                 coding, a concept which is well adapted to the
                 continuous real-world inputs of signal processing
                 applications. Unary coding enables a number of atypical
                 micro-architecture choices which bring down area cost
                 and energy; moreover, unary coding provides graceful
                 output degradation as the amount of transient faults
                 increases. We introduce a configurable hybrid
                 digital/analog micro-architecture capable of
                 implementing a broad set of signal processing
                 applications based on these concepts, together with a
                 back-end optimizer which takes advantage of the special
                 nature of these applications. For a set of five signal
                 applications, we explore the different design tradeoffs
                 and obtain an accelerator with an area cost of 1.63
                 mm$^2$. On average, this accelerator requires only
                 2.3\% of the energy of an Atom-like core to implement
                 similar tasks. We then evaluate the accelerator
                 resilience to transient faults, and its ability to
                 trade accuracy for energy savings.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Petrica:2013:FDA,
  author =       "Paula Petrica and Adam M. Izraelevitz and David H.
                 Albonesi and Christine A. Shoemaker",
  title =        "{Flicker}: a dynamically adaptive architecture for
                 power limited multicore systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "13--23",
  month =        jun,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2508148.2485924",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Future microprocessors may become so power constrained
                 that not all transistors will be able to be powered on
                 at once. These systems will be required to nimbly adapt
                 to changes in the chip power that is allocated to
                 general-purpose cores and to specialized accelerators.
                 This paper presents Flicker, a general-purpose
                 multicore architecture that dynamically adapts to
                 varying and potentially stringent limits on allocated
                 power. The Flicker core microarchitecture includes
                 deconfigurable lanes --- horizontal slices through the
                 pipeline --- that permit tailoring an individual core
                 to the running application with lower overhead than
                 microarchitecture-level adaptation, and greater
                 flexibility than core-level power gating. To exploit
                 Flicker's flexible pipeline architecture, a new online
                 multicore optimization algorithm combines reduced
                 sampling techniques, application of response surface
                 models to online optimization, and heuristic online
                 search. The approach efficiently finds a
                 near-global-optimum configuration of lanes without
                 requiring offline training, microarchitecture state, or
                 foreknowledge of the workload. At high power
                 allocations, core-level gating is highly effective, and
                 slightly outperforms Flicker overall. However, under
                 stringent power constraints, Flicker significantly
                 outperforms core-level gating, achieving an average
                 27\% performance improvement.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Qadeer:2013:CEB,
  author =       "Wajahat Qadeer and Rehan Hameed and Ofer Shacham and
                 Preethi Venkatesan and Christos Kozyrakis and Mark A.
                 Horowitz",
  title =        "Convolution engine: balancing efficiency \&
                 flexibility in specialized computing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "24--35",
  month =        jun,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2508148.2485925",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "This paper focuses on the trade-off between
                 flexibility and efficiency in specialized computing. We
                 observe that specialized units achieve most of their
                 efficiency gains by tuning data storage and compute
                 structures and their connectivity to the data-flow and
                 data-locality patterns in the kernels. Hence, by
                 identifying key data-flow patterns used in a domain, we
                 can create efficient engines that can be programmed and
                 reused across a wide range of applications. We present
                 an example, the Convolution Engine (CE), specialized
                 for the convolution-like data-flow that is common in
                 computational photography, image processing, and video
                 processing applications. CE achieves energy efficiency
                 by capturing data reuse patterns, eliminating data
                 transfer overheads, and enabling a large number of
                 operations per memory access. We quantify the tradeoffs
                 in efficiency and flexibility and demonstrate that CE
                 is within a factor of 2-3x of the energy and area
                 efficiency of custom units optimized for a single
                 kernel. CE improves energy and area efficiency by 8-15x
                 over a SIMD engine for most applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lim:2013:TSS,
  author =       "Kevin Lim and David Meisner and Ali G. Saidi and
                 Parthasarathy Ranganathan and Thomas F. Wenisch",
  title =        "Thin servers with smart pipes: designing {SoC}
                 accelerators for memcached",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "36--47",
  month =        jun,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2508148.2485926",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Distributed in-memory key-value stores, such as
                 memcached, are central to the scalability of modern
                 internet services. Current deployments use commodity
                 servers with high-end processors. However, given the
                 cost-sensitivity of internet services and the recent
                 proliferation of volume low-power System-on-Chip (SoC)
                 designs, we see an opportunity for alternative
                 architectures. We undertake a detailed characterization
                 of memcached to reveal performance and power
                 inefficiencies. Our study considers both
                 high-performance and low-power CPUs and NICs across a
                 variety of carefully-designed benchmarks that exercise
                 the range of memcached behavior. We discover that,
                 regardless of CPU microarchitecture, memcached
                 execution is remarkably inefficient, saturating neither
                 network links nor available memory bandwidth. Instead,
                 we find performance is typically limited by the
                 per-packet processing overheads in the NIC and OS
                 kernel --- long code paths limit CPU performance due to
                 poor branch predictability and instruction fetch
                 bottlenecks. Our insights suggest that neither
                 high-performance nor low-power cores provide a
                 satisfactory power-performance trade-off, and point to
                 a need for tighter integration of the network
                 interface. Hence, we argue for an alternate
                 architecture --- Thin Servers with Smart Pipes (TSSP)
                 --- for cost-effective high-performance memcached
                 deployment. TSSP couples an embedded-class low-power
                 core to a memcached accelerator that can process GET
                 requests entirely in hardware, offloading both network
                 handling and data look up. We demonstrate the potential
                 benefits of our TSSP architecture through an FPGA
                 prototyping platform, and show the potential for a
                 6x--16x power-performance improvement over conventional
                 server baselines.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Mukundan:2013:UMR,
  author =       "Janani Mukundan and Hillery Hunter and Kyu-hyoun Kim
                 and Jeffrey Stuecheli and Jos{\'e} F. Mart{\'\i}nez",
  title =        "Understanding and mitigating refresh overheads in
                 high-density {DDR4 DRAM} systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "48--59",
  month =        jun,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2508148.2485927",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Recent DRAM specifications exhibit increasing refresh
                 latencies. A refresh command blocks a full rank,
                 decreasing available parallelism in the memory
                 subsystem significantly, thus decreasing performance.
                 Fine Granularity Refresh (FGR) is a feature recently
                 announced as part of JEDEC's DDR4 DRAM specification
                 that attempts to tackle this problem by creating a
                 range of refresh options that provide a trade-off
                 between refresh latency and frequency. In this paper,
                 we first conduct an analysis of DDR4 DRAM's FGR
                 feature, and show that there is no one-size-fits-all
                 option across a variety of applications. We then
                 present Adaptive Refresh (AR), a simple yet effective
                 mechanism that dynamically chooses the best FGR mode
                 for each application and phase within the application.
                 When looking at the refresh problem more closely, we
                 identify in high-density DRAM systems a phenomenon that
                 we call command queue seizure, whereby the memory
                 controller's command queue seizes up temporarily
                 because it is full with commands to a rank that is
                 being refreshed. To attack this problem, we propose two
                 complementary mechanisms called Delayed Command
                 Expansion (DCE) and Preemptive Command Drain (PCD). Our
                 results show that AR does exploit DDR4's FGR
                 effectively. However, once our proposed DCE and PCD
                 mechanisms are added, DDR4's FGR becomes redundant in
                 most cases, except in a few highly memory-sensitive
                 applications, where the use of AR does provide some
                 additional benefit. In all, our simulations show that
                 the proposed mechanisms yield 8\% (14\%) mean speedup
                 with respect to traditional refresh, at normal
                 (extended) DRAM operating temperatures, for a set of
                 diverse parallel applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Liu:2013:ESD,
  author =       "Jamie Liu and Ben Jaiyen and Yoongu Kim and Chris
                 Wilkerson and Onur Mutlu",
  title =        "An experimental study of data retention behavior in
                 modern {DRAM} devices: implications for retention time
                 profiling mechanisms",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "60--71",
  month =        jun,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2508148.2485928",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "DRAM cells store data in the form of charge on a
                 capacitor. This charge leaks off over time, eventually
                 causing data to be lost. To prevent this data loss from
                 occurring, DRAM cells must be periodically refreshed.
                 Unfortunately, DRAM refresh operations waste energy and
                 also degrade system performance by interfering with
                 memory requests. These problems are expected to worsen
                 as DRAM density increases. The amount of time that a
                 DRAM cell can safely retain data without being
                 refreshed is called the cell's retention time. In
                 current systems, all DRAM cells are refreshed at the
                 rate required to guarantee the integrity of the cell
                 with the shortest retention time, resulting in
                 unnecessary refreshes for cells with longer retention
                 times. Prior work has proposed to reduce unnecessary
                 refreshes by exploiting differences in retention time
                 among DRAM cells; however, such mechanisms require
                 knowledge of each cell's retention time. In this paper,
                 we present a comprehensive quantitative study of
                 retention behavior in modern DRAMs. Using a
                 temperature-controlled FPGA-based testing platform, we
                 collect retention time information from 248 commodity
                 DDR3 DRAM chips from five major DRAM vendors. We
                 observe two significant phenomena: data pattern
                 dependence, where the retention time of each DRAM cell
                 is significantly affected by the data stored in other
                 DRAM cells, and variable retention time, where the
                 retention time of some DRAM cells changes unpredictably
                 over time. We discuss possible physical explanations
                 for these phenomena, how their magnitude may be
                 affected by DRAM technology scaling, and their
                 ramifications for DRAM retention time profiling
                 mechanisms.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Nair:2013:AAF,
  author =       "Prashant J. Nair and Dae-Hyun Kim and Moinuddin K.
                 Qureshi",
  title =        "{ArchShield}: architectural framework for assisting
                 {DRAM} scaling by tolerating high error rates",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "72--83",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485929",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "DRAM scaling has been the prime driver for increasing
                 the capacity of main memory system over the past three
                 decades. Unfortunately, scaling DRAM to smaller
                 technology nodes has become challenging due to the
                 inherent difficulty in designing smaller geometries,
                 coupled with the problems of device variation and
                 leakage. Future DRAM devices are likely to experience
                 significantly high error-rates. Techniques that can
                 tolerate errors efficiently can enable DRAM to scale to
                 smaller technology nodes. However, existing techniques
                 such as row/column sparing and ECC become prohibitive
                 at high error-rates. To develop cost-effective
                 solutions for tolerating high error-rates, this paper
                 advocates a cross-layer approach. Rather than hiding
                 the faulty cell information within the DRAM chips, we
                 expose it to the architectural level. We propose
                 ArchShield, an architectural framework that employs
                 runtime testing to identify faulty DRAM cells.
                 ArchShield tolerates these faults using two components,
                 a Fault Map that keeps information about faulty words
                 in a cache line, and Selective Word-Level Replication
                 (SWLR) that replicates faulty words for error
                 resilience. Both Fault Map and SWLR are integrated in
                 reserved area in DRAM memory. Our evaluations with 8GB
                 DRAM DIMM show that ArchShield can efficiently tolerate
                 error-rates as higher as 10$^{-4}$ (100x higher than
                 ECC alone), causes less than 2\% performance
                 degradation, and still maintains 1-bit error tolerance
                 against soft errors.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ghose:2013:IMS,
  author =       "Saugata Ghose and Hyodong Lee and Jos{\'e} F.
                 Mart{\'\i}nez",
  title =        "Improving memory scheduling via processor-side load
                 criticality information",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "84--95",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485930",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "We hypothesize that performing processor-side analysis
                 of load instructions, and providing this pre-digested
                 information to memory schedulers judiciously, can
                 increase the sophistication of memory decisions while
                 maintaining a lean memory controller that can take
                 scheduling actions quickly. This is increasingly
                 important as DRAM frequencies continue to increase
                 relative to processor speed. In this paper we propose
                 one such mechanism, pairing up a processor-side load
                 criticality predictor with a lean memory controller
                 that prioritizes load requests based on ranking
                 information supplied from the processor side. Using a
                 sophisticated multi-core simulator that includes a
                 detailed quad-channel DDR3 DRAM model, we demonstrate
                 that this mechanism can improve performance
                 significantly on a CMP, with minimal overhead and
                 virtually no changes to the processor itself. We show
                 that our design compares favorably to several
                 state-of-the-art schedulers.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Isci:2013:AEV,
  author =       "Canturk Isci and Suzanne McIntosh and Jeffrey Kephart
                 and Rajarshi Das and James Hanson and Scott Piper and
                 Robert Wolford and Thomas Brey and Robert Kantner and
                 Allen Ng and James Norris and Abdoulaye Traore and
                 Michael Frissora",
  title =        "Agile, efficient virtualization power management with
                 low-latency server power states",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "96--107",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485931",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "One of the main driving forces of the growing adoption
                 of virtualization is its dramatic simplification of the
                 provisioning and dynamic management of IT resources. By
                 decoupling running entities from the underlying
                 physical resources, and by providing easy-to-use
                 controls to allocate, deallocate and migrate virtual
                 machines (VMs) across physical boundaries,
                 virtualization opens up new opportunities for improving
                 overall system resource use and power efficiency. While
                 a range of techniques for dynamic, distributed resource
                 management of virtualized systems have been proposed
                 and have seen their widespread adoption in enterprise
                 systems, similar techniques for dynamic power
                 management have seen limited acceptance. The main
                 barrier to dynamic, power-aware virtualization
                 management stems not from the limitations of
                 virtualization, but rather from the underlying physical
                 systems; and in particular, the high latency and energy
                 cost of power state change actions suited for
                 virtualization power management. In this work, we first
                 explore the feasibility of low-latency power states for
                 enterprise server systems and demonstrate, with real
                 prototypes, their quantitative energy-performance trade
                 offs compared to traditional server power states. Then,
                 we demonstrate an end-to-end power-aware virtualization
                 management solution leveraging these states, and
                 evaluate the dramatically-favorable power-performance
                 characteristics achievable with such systems. We
                 present, via both real system implementations and
                 scale-out simulations, that virtualization power
                 management with low-latency server power states can
                 achieve comparable overheads as base distributed
                 resource management in virtualized systems, and thus
                 can benefit from the same level of adoption, while
                 delivering close to energy-proportional power
                 efficiency.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Tu:2013:SDS,
  author =       "Cheng-Chun Tu and Chao-tang Lee and Tzi-cker Chiueh",
  title =        "Secure {I/O} device sharing among virtual machines on
                 multiple hosts",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "108--119",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485932",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Virtualization allows flexible mappings between
                 physical resources and virtual entities, and improves
                 allocation efficiency and agility. Unfortunately, most
                 existing virtualization technologies are limited to
                 resources in a single host. This paper presents the
                 design, implementation and evaluation of a multi-host
                 I/O device virtualization system called Ladon, which
                 enables I/O devices to be shared among virtual machines
                 running on multiple hosts in a secure and efficient
                 way. Specifically, Ladon uses a PCIe network to connect
                 multiple servers with PCIe devices and allows VMs
                 running on these servers to directly interact with
                 these PCIe devices without interfering with one
                 another. Through an evaluation of a fully operational
                 Ladon prototype, we show that there is no throughput
                 and latency penalty of the multi-host I/O
                 virtualization enabled by Ladon compared to those of
                 the existing single-host I/O virtualization
                 technology.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chang:2013:IVP,
  author =       "Xiaotao Chang and Hubertus Franke and Yi Ge and Tao
                 Liu and Kun Wang and Jimi Xenidis and Fei Chen and Yu
                 Zhang",
  title =        "Improving virtualization in the presence of software
                 managed translation lookaside buffers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "120--129",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485933",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Virtualization has become an important technology that
                 is used across many platforms, particularly servers, to
                 increase utilization, multi-tenancy and security.
                 Virtualization introduces additional overhead that
                 often relates to memory management, interrupt handling
                 and hypervisor mode switching. Among those, memory
                 management and translation lookaside buffer (TLB)
                 management have been shown to have a significant impact
                 on the performance of systems. Two principal mechanisms
                 for TLB management exist in today's systems, namely
                 software and hardware managed TLBs. In this paper, we
                 analyze and quantify the overhead of a pure software
                 virtualization that is implemented over a software
                 managed TLB. We then describe our design of hardware
                 extensions to support virtualization in systems with
                 software managed TLBs to remove the most dominant
                 overheads. These extensions were implemented in the
                 Power embedded A2 core, which is used in the PowerEN
                 and in the Blue Gene/Q processors. They were used to
                 implement a KVM port. We evaluate each of these
                 hardware extensions to determine their overall
                 contributions to performance and efficiency.
                 Collectively these extensions demonstrate an average
                 improvement of 232\% over a pure software
                 implementation.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kim:2013:MME,
  author =       "Ji Kim and Christopher Torng and Shreesha Srinath and
                 Derek Lockhart and Christopher Batten",
  title =        "Microarchitectural mechanisms to exploit value
                 structure in {SIMT} architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "130--141",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485934",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "SIMT architectures improve performance and efficiency
                 by exploiting control and memory-access structure
                 across data-parallel threads. Value structure occurs
                 when multiple threads operate on values that can be
                 compactly encoded, e.g., by using a simple function of
                 the thread index. We characterize the availability of
                 control, memory-access, and value structure in typical
                 kernels and observe ample amounts of value structure
                 that is largely ignored by current SIMT architectures.
                 We propose three microarchitectural mechanisms to
                 exploit value structure based on compact affine
                 execution of arithmetic, branch, and memory
                 instructions. We explore these mechanisms within the
                 context of traditional SIMT microarchitectures
                 (GP-SIMT), found in general-purpose graphics processing
                 units, as well as fine-grain SIMT microarchitectures
                 (FG-SIMT), a SIMT variant appropriate for
                 compute-focused data-parallel accelerators. Cycle-level
                 modeling of a modern GP-SIMT system and a VLSI
                 implementation of an eight-lane FG-SIMT execution
                 engine are used to evaluate a range of application
                 kernels. When compared to a baseline without compact
                 affine execution, our approach can improve GP-SIMT
                 cycle-level performance by 4--17\% and can improve
                 FG-SIMT absolute performance by 20--65\% and energy
                 efficiency up to 30\% for a majority of the kernels.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Parashar:2013:TIC,
  author =       "Angshuman Parashar and Michael Pellauer and Michael
                 Adler and Bushra Ahsan and Neal Crago and Daniel Lustig
                 and Vladimir Pavlov and Antonia Zhai and Mohit Gambhir
                 and Aamer Jaleel and Randy Allmon and Rachid Rayess and
                 Stephen Maresh and Joel Emer",
  title =        "Triggered instructions: a control paradigm for
                 spatially-programmed architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "142--153",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485935",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "In this paper, we present triggered instructions, a
                 novel control paradigm for arrays of processing
                 elements (PEs) aimed at exploiting spatial parallelism.
                 Triggered instructions completely eliminate the program
                 counter and allow programs to transition concisely
                 between states without explicit branch instructions.
                 They also allow efficient reactivity to inter-PE
                 communication traffic. The approach provides a unified
                 mechanism to avoid over-serialized execution,
                 essentially achieving the effect of techniques such as
                 dynamic instruction reordering and multithreading,
                 which each require distinct hardware mechanisms in a
                 traditional sequential architecture. Our analysis shows
                 that a triggered-instruction based spatial accelerator
                 can achieve 8X greater area-normalized performance than
                 a traditional general-purpose processor. Further
                 analysis shows that triggered control reduces the
                 number of static and dynamic instructions in the
                 critical paths by 62\% and 64\% respectively over a
                 program-counter style spatial baseline, resulting in a
                 speedup of 2.0X.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Joao:2013:UBA,
  author =       "Jos{\'e} A. Joao and M. Aater Suleman and Onur Mutlu
                 and Yale N. Patt",
  title =        "Utility-based acceleration of multithreaded
                 applications on asymmetric {CMPs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "154--165",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485936",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Asymmetric Chip Multiprocessors (ACMPs) are becoming a
                 reality. ACMPs can speed up parallel applications if
                 they can identify and accelerate code segments that are
                 critical for performance. Proposals already exist for
                 using coarse-grained thread scheduling and fine-grained
                 bottleneck acceleration. Unfortunately, there have been
                 no proposals offered thus far to decide which code
                 segments to accelerate in cases where both
                 coarse-grained thread scheduling and fine-grained
                 bottleneck acceleration could have value. This paper
                 proposes Utility-Based Acceleration of Multithreaded
                 Applications on Asymmetric CMPs (UBA), a cooperative
                 software/hardware mechanism for identifying and
                 accelerating the most likely critical code segments
                 from a set of multithreaded applications running on an
                 ACMP. The key idea is a new Utility of Acceleration
                 metric that quantifies the performance benefit of
                 accelerating a bottleneck or a thread by taking into
                 account both the criticality and the expected speedup.
                 UBA outperforms the best of two state-of-the-art
                 mechanisms by 11\% for single application workloads and
                 by 7\% for two-application workloads on an ACMP with 52
                 small cores and 3 large cores.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kudrow:2013:QRC,
  author =       "Daniel Kudrow and Kenneth Bier and Zhaoxia Deng and
                 Diana Franklin and Yu Tomita and Kenneth R. Brown and
                 Frederic T. Chong",
  title =        "Quantum rotations: a case study in static and dynamic
                 machine-code generation for quantum computers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "166--176",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485937",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Work in quantum computer architecture has focused on
                 communication, layout and fault tolerance, largely
                 driven by Shor's factorization algorithm. For the first
                 time, we study a larger range of benchmarks and find
                 that another critical issue is the generation of code
                 sequences for quantum rotation operations.
                 Specifically, quantum algorithms require arbitrary
                 rotation angles, while quantum technologies and error
                 correction codes provide only for discrete angles and
                 operators. A sequence of quantum machine instructions
                 must be generated to approximate the arbitrary rotation
                 to the required precision. While previous work has
                 focused exclusively on static compilation, we find that
                 some applications require dynamic code generation and
                 explore the advantages and disadvantages of static and
                 dynamic approaches. We find that static code generation
                 can, in some cases, lead to a terabyte of machine code
                 to support required rotations. We also find that some
                 rotation angles are unknown until run time, requiring
                 dynamic code generation. Dynamic code generation,
                 however, exhibits significant trade-offs in terms of
                 time overhead versus code size. Furthermore, dynamic
                 code generation will be performed on classical
                 (non-quantum) computing resources, which may or may not
                 have a clock speed advantage over the target quantum
                 technology. For example, operations on trapped ions run
                 at kilohertz speeds, but superconducting qubits run at
                 gigahertz speeds. We introduce a new method for
                 compiling arbitrary rotations dynamically, designed to
                 minimize compilation time. The new method reduces
                 compilation time by up to five orders of magnitude
                 while increasing code size by one order of magnitude.
                 We explore the design space formed by these trade-offs
                 of dynamic versus static code generation, code quality,
                 and quantum technology. We introduce several techniques
                 to provide smoother trade-offs for dynamic code
                 generation and evaluate the viability of options in the
                 design space.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Muscat:2013:DBM,
  author =       "Richard A. Muscat and Karin Strauss and Luis Ceze and
                 Georg Seelig",
  title =        "{DNA}-based molecular architecture with spatially
                 localized components",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "177--188",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485938",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Performing computation inside living cells offers
                 life-changing applications, from improved medical
                 diagnostics to better cancer therapy to intelligent
                 drugs. Due to its bio-compatibility and ease of
                 engineering, one promising approach for performing
                 in-vivo computation is DNA strand displacement. This
                 paper introduces computer architects to DNA strand
                 displacement ``circuits'', discusses associated
                 architectural challenges, and proposes a new
                 organization that provides practical composability. In
                 particular, prior approaches rely mostly on stochastic
                 interaction of freely diffusing components. This paper
                 proposes practical spatial isolation of components,
                 leading to more easily designed DNA-based circuits. DNA
                 nanotechnology is currently at a turning point, with
                 many proposed applications being realized [20, 9]. We
                 believe that it is time for the computer architecture
                 community to take notice and contribute.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Guo:2013:ADA,
  author =       "Qing Guo and Xiaochen Guo and Ravi Patel and Engin
                 Ipek and Eby G. Friedman",
  title =        "{AC-DIMM}: associative computing with {STT-MRAM}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "189--200",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485939",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "With technology scaling, on-chip power dissipation and
                 off-chip memory bandwidth have become significant
                 performance bottlenecks in virtually all computer
                 systems, from mobile devices to supercomputers. An
                 effective way of improving performance in the face of
                 bandwidth and power limitations is to rely on
                 associative memory systems. Recent work on a PCM-based,
                 associative TCAM accelerator shows that associative
                 search capability can reduce both off-chip bandwidth
                 demand and overall system energy. Unfortunately,
                 previously proposed resistive TCAM accelerators have
                 limited flexibility: only a restricted (albeit
                 important) class of applications can benefit from a
                 TCAM accelerator, and the implementation is confined to
                 resistive memory technologies with a high dynamic range
                 ($R_{\mathrm{High}}/R_{\mathrm{Low}}$), such as PCM. This work
                 proposes AC-DIMM, a flexible, high-performance
                 associative compute engine built on a DDR3-compatible
                 memory module. AC-DIMM addresses the limited
                 flexibility of previous resistive TCAM accelerators by
                 combining two powerful capabilities --- associative
                 search and processing in memory. Generality is improved
                 by augmenting a TCAM system with a set of integrated,
                 user programmable microcontrollers that operate
                 directly on search results, and by architecting the
                 system such that key-value pairs can be co-located in
                 the same TCAM row. A new, bit-serial TCAM array is
                 proposed, which enables the system to be implemented
                 using STT-MRAM. AC-DIMM achieves a 4.2X speedup and a
                 6.5X energy reduction over a conventional RAM-based
                 system on a set of 13 evaluated applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hechtman:2013:EMC,
  author =       "Blake A. Hechtman and Daniel J. Sorin",
  title =        "Exploring memory consistency for massively-threaded
                 throughput-oriented processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "201--212",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485940",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "We re-visit the issue of hardware consistency models
                 in the new context of massively-threaded
                 throughput-oriented processors (MTTOPs). A prominent
                 example of an MTTOP is a GPGPU, but other examples
                 include Intel's MIC architecture and some recent
                 academic designs. MTTOPs differ from CPUs in many
                 significant ways, including their ability to tolerate
                 latency, their memory system organization, and the
                 characteristics of the software they run. We compare
                 implementations of various hardware consistency models
                 for MTTOPs in terms of performance, energy-efficiency,
                 hardware complexity, and programmability. Our results
                 show that the choice of hardware consistency model has
                 a surprisingly minimal impact on performance and thus
                 the decision should be based on hardware complexity,
                 energy-efficiency, and programmability. For many
                 MTTOPs, it is likely that even a simple implementation
                 of sequential consistency is attractive.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Duan:2013:WTM,
  author =       "Yuelu Duan and Abdullah Muzahid and Josep Torrellas",
  title =        "{WeeFence}: toward making fences free in {TSO}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "213--224",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485941",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Although fences are designed for low-overhead
                 concurrency coordination, they can be expensive in
                 current machines. If fences were largely free, faster
                 fine-grained concurrent algorithms could be devised,
                 and compilers could guarantee Sequential Consistency
                 (SC) at little cost. In this paper, we present WeeFence
                 (or WFence for short), a fence that is very cheap
                 because it allows post-fence accesses to skip it. Such
                 accesses can typically complete and retire before the
                 pre-fence writes have drained from the write buffer.
                 Only when an incorrect reordering of accesses is about
                 to happen, does the hardware stall to prevent it. In
                 the paper, we present the WFence design for TSO, and
                 compare it to a conventional fence with speculation for
                 8-processor multicore simulations. We run parallel
                 kernels that contain explicit fences and parallel
                 applications that do not. For the kernels, WFence
                 eliminates nearly all of the fence stall, reducing the
                 kernels' execution time by an average of 11\%. For the
                 applications, a conservative compiler algorithm places
                 fences in the code to guarantee SC. In this case, on
                 average, WFences reduce the resulting fence overhead
                 from 38\% of the applications' execution time to 2\%
                 (in a centralized WFence design), or from 36\% to 5\%
                 (in a distributed WFence design).",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Cain:2013:RAS,
  author =       "Harold W. Cain and Maged M. Michael and Brad Frey and
                 Cathy May and Derek Williams and Hung Le",
  title =        "Robust architectural support for transactional memory
                 in the {Power} architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "225--236",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485942",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "On the twentieth anniversary of the original
                 publication [10], following ten years of intense
                 activity in the research literature, hardware support
                 for transactional memory (TM) has finally become a
                 commercial reality, with HTM-enabled chips currently or
                 soon-to-be available from many hardware vendors. In
                 this paper we describe architectural support for TM
                 added to a future version of the Power ISA{\TM}. Two
                 imperatives drove the development: the desire to
                 complement our weakly-consistent memory model with a
                 more friendly interface to simplify the development and
                 porting of multithreaded applications, and the need for
                 robustness beyond that of some early implementations.
                 In the process of commercializing the feature, we had
                 to resolve some previously unexplored interactions
                 between TM and existing features of the ISA, for
                 example translation shootdown, interrupt handling,
                 atomic read-modify-write primitives, and our weakly
                 consistent memory model. We describe these
                 interactions, the overall architecture, and discuss the
                 motivation and rationale for our choices of
                 architectural semantics, beyond what is typically found
                 in reference manuals.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Basu:2013:EVM,
  author =       "Arkaprava Basu and Jayneel Gandhi and Jichuan Chang
                 and Mark D. Hill and Michael M. Swift",
  title =        "Efficient virtual memory for big memory servers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "237--248",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485943",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Our analysis shows that many ``big-memory'' server
                 workloads, such as databases, in-memory caches, and
                 graph analytics, pay a high cost for page-based virtual
                 memory. They consume as much as 10\% of execution
                 cycles on TLB misses, even using large pages. On the
                 other hand, we find that these workloads use read-write
                 permission on most pages, are provisioned not to swap,
                 and rarely benefit from the full flexibility of
                 page-based virtual memory. To remove the TLB miss
                 overhead for big-memory workloads, we propose mapping
                 part of a process's linear virtual address space with a
                 direct segment, while page mapping the rest of the
                 virtual address space. Direct segments use minimal
                 hardware --- base, limit and offset registers per core
                 --- to map contiguous virtual memory regions directly
                 to contiguous physical memory. They eliminate the
                 possibility of TLB misses for key data structures such
                 as database buffer pools and in-memory key-value
                 stores. Memory mapped by a direct segment may be
                 converted back to paging when needed. We prototype
                 direct-segment software support for x86-64 in Linux and
                 emulate direct-segment hardware. For our workloads,
                 direct segments eliminate almost all TLB misses and
                 reduce the execution time wasted on TLB misses to less
                 than 0.5\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wu:2013:NBD,
  author =       "Lisa Wu and Raymond J. Barker and Martha A. Kim and
                 Kenneth A. Ross",
  title =        "Navigating big data with high-throughput,
                 energy-efficient data partitioning",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "249--260",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485944",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "The global pool of data is growing at 2.5 quintillion
                 bytes per day, with 90\% of it produced in the last two
                 years alone [24]. There is no doubt the era of big data
                 has arrived. This paper explores targeted deployment of
                 hardware accelerators to improve the throughput and
                 energy efficiency of large-scale data processing. In
                 particular, data partitioning is a critical operation
                 for manipulating large data sets. It is often the
                 limiting factor in database performance and represents
                 a significant fraction of the overall runtime of large
                 data queries. To accelerate partitioning, this paper
                 describes a hardware accelerator for range
                 partitioning, or HARP, and a hardware-software data
                 streaming framework. The streaming framework offers a
                 seamless execution environment for streaming
                 accelerators such as HARP. Together, HARP and the
                 streaming framework provide an order of magnitude
                 improvement in partitioning performance and energy. A
                 detailed analysis of a 32 nm physical design shows 7.8
                 times the throughput of a highly optimized and
                 optimistic software implementation, while consuming
                 just 6.9\% of the area and 4.3\% of the power of a
                 single Xeon core in the same technology generation.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chung:2013:LBD,
  author =       "Eric S. Chung and John D. Davis and Jaewon Lee",
  title =        "{LINQits}: big data on little clients",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "261--272",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485945",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "We present LINQits, a flexible hardware template that
                 can be mapped onto programmable logic or ASICs in a
                 heterogeneous system-on-chip for a mobile device or
                 server. Unlike fixed-function accelerators, LINQits
                 accelerates a domain-specific query language called
                 LINQ. LINQits does not provide coverage for all
                 possible applications --- however, existing
                 applications (re-)written with LINQ in mind benefit
                 extensively from hardware acceleration. Furthermore,
                 the LINQits framework offers a graceful and transparent
                 migration path from software to hardware. LINQits is
                 prototyped on a 2W heterogeneous SoC called the ZYNQ
                 processor, which combines dual ARM A9 processors with
                 an FPGA on a single die in 28nm silicon technology. Our
                 physical measurements show that LINQits improves energy
                 efficiency by 8.9 to 30.6 times and performance by 10.7
                 to 38.1 times compared to optimized, multithreaded C
                 programs running on conventional ARM A9 processors.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Atta:2013:SBI,
  author =       "Islam Atta and Pinar T{\"o}z{\"u}n and Xin Tong and
                 Anastasia Ailamaki and Andreas Moshovos",
  title =        "{STREX}: boosting instruction cache reuse in {OLTP}
                 workloads through stratified transaction execution",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "273--284",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485946",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Online transaction processing (OLTP) workload
                 performance suffers from instruction stalls; the
                 instruction footprint of a typical transaction exceeds
                 by far the capacity of an L1 cache, leading to ongoing
                 cache thrashing. Several proposed techniques remove
                 some instruction stalls in exchange for error-prone
                 instrumentation to the code base, or a sharp increase
                 in the L1-I cache unit area and power. Others reduce
                 instruction miss latency by better utilizing a shared
                 L2 cache. SLICC [2], a recently proposed thread
                 migration technique that exploits transaction
                 instruction locality, is promising for high core counts
                 but performs sub-optimally or may hurt performance when
                 running on few cores. This paper corroborates that OLTP
                 transactions exhibit significant intra- and
                 inter-thread overlap in their instruction footprint,
                 and analyzes the instruction stall reduction benefits.
                 This paper presents STREX, a hardware,
                 programmer-transparent technique that exploits typical
                 transaction behavior to improve instruction reuse in
                 first level caches. STREX time-multiplexes the
                 execution of similar transactions dynamically on a
                 single core so that instructions fetched by one
                 transaction are reused by all other transactions
                 executing in the system as much as possible. STREX
                 dynamically slices the execution of each transaction
                 into cache-sized segments simply by observing when
                 blocks are brought in the cache and when they are
                 evicted. Experiments show that, when compared to
                 baseline execution on 2--16 cores, STREX consistently
                 improves performance while reducing the number of L1
                 instruction and data misses by 37\% and 14\% on
                 average, respectively. Finally, this paper proposes a
                 practical hybrid technique that combines STREX and
                 SLICC, thereby guaranteeing performance benefits
                 regardless of the number of available cores and the
                 workload's footprint.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Paul:2013:CBN,
  author =       "Indrani Paul and Srilatha Manne and Manish Arora and
                 W. Lloyd Bircher and Sudhakar Yalamanchili",
  title =        "Cooperative boosting: needy versus greedy power
                 management",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "285--296",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485947",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "This paper examines the interaction between thermal
                 management techniques and power boosting in a
                 state-of-the-art heterogeneous processor consisting of
                 a set of CPU and GPU cores. We show that for classes of
                 applications that utilize both the CPU and the GPU,
                 modern boost algorithms that greedily seek to convert
                 thermal headroom into performance can interact with
                 thermal coupling effects between the CPU and the GPU to
                 degrade performance. We first examine the causes of
                 this behavior and explain the interaction between
                 thermal coupling, performance coupling, and workload
                 behavior. Then we propose a dynamic power-management
                 approach called cooperative boosting (CB) to allocate
                 power dynamically between CPU and GPU in a manner that
                 balances thermal coupling against the needs of
                 performance coupling to optimize performance under a
                 given thermal constraint. Through real hardware-based
                 measurements, we evaluate CB against a
                 state-of-the-practice boost algorithm and show that
                 overall application performance and power savings
                 increase by 10\% and 8\% (up to 52\% and 34\%),
                 respectively, resulting in average energy efficiency
                 improvement of 25\% (up to 76\%) over a wide range of
                 benchmarks.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Bacha:2013:DRV,
  author =       "Anys Bacha and Radu Teodorescu",
  title =        "Dynamic reduction of voltage margins by leveraging
                 on-chip {ECC} in {Itanium II} processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "297--307",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485948",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Lowering supply voltage is one of the most effective
                 approaches for improving the energy efficiency of
                 microprocessors. Unfortunately, technology limitations,
                 such as process variability and circuit aging, are
                 forcing microprocessor designers to add larger voltage
                 guardbands to their chips. This makes supply voltage
                 increasingly difficult to scale with technology. This
                 paper presents a new mechanism for dynamically reducing
                 voltage margins while maintaining the chip operating
                 frequency constant. Unlike previous approaches that
                 rely on special hardware to detect and recover from
                 timing violations caused by low-voltage execution, our
                 solution is firmware-based and does not require
                 additional hardware. Instead, it relies on error
                 correction mechanisms already built into modern
                 processors. The system dynamically reduces voltage
                 margins and uses correctable error reports raised by
                 the hardware to identify the lowest, safe operating
                 voltage. The solution adapts to core-to-core
                 variability by tailoring supply voltage to each core's
                 safe operating level. In addition, it exploits
                 variability in workload vulnerability to low voltage
                 execution. The system was prototyped on an HP Integrity
                 Server that uses Intel's Itanium 9560 processors.
                 Evaluation using SPECjbb2005 and SPEC CPU2000 workloads
                 shows core power savings ranging from 18\% to 23\%,
                 with minimal performance impact.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Cook:2013:HEC,
  author =       "Henry Cook and Miquel Moreto and Sarah Bird and Khanh
                 Dao and David A. Patterson and Krste Asanovic",
  title =        "A hardware evaluation of cache partitioning to improve
                 utilization and energy-efficiency while preserving
                 responsiveness",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "308--319",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485949",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Computing workloads often contain a mix of
                 interactive, latency-sensitive foreground applications
                 and recurring background computations. To guarantee
                 responsiveness, interactive and batch applications are
                 often run on disjoint sets of resources, but this
                 incurs additional energy, power, and capital costs. In
                 this paper, we evaluate the potential of hardware cache
                 partitioning mechanisms and policies to improve
                 efficiency by allowing background applications to run
                 simultaneously with interactive foreground
                 applications, while avoiding degradation in interactive
                 responsiveness. We evaluate these tradeoffs using
                 commercial x86 multicore hardware that supports cache
                 partitioning, and find that real hardware measurements
                 with full applications provide different observations
                 than past simulation-based evaluations. Co-scheduling
                 applications without LLC partitioning leads to a 10\%
                 energy improvement and average throughput improvement
                 of 54\% compared to running tasks separately, but can
                 result in foreground performance degradation of up to
                 34\% with an average of 6\%. With optimal static LLC
                 partitioning, the average energy improvement increases
                 to 12\% and the average throughput improvement to 60\%,
                 while the worst case slowdown is reduced noticeably to
                 7\% with an average slowdown of only 2\%. We also
                 evaluate a practical low-overhead dynamic algorithm to
                 control partition sizes, and are able to realize the
                 potential performance guarantees of the optimal static
                 approach, while increasing background throughput by an
                 additional 19\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Das:2013:CEP,
  author =       "Reetuparna Das and Satish Narayanasamy and Sudhir K.
                 Satpathy and Ronald G. Dreslinski",
  title =        "{Catnap}: energy proportional multiple
                 network-on-chip",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "320--331",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485950",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Multiple networks have been used in several processor
                 implementations to scale bandwidth and ensure
                 protocol-level deadlock freedom for different message
                 classes. In this paper, we observe that a
                 multiple-network design is also attractive from a power
                 perspective and can be leveraged to achieve energy
                 proportionality by effective power gating. Unlike a
                 single-network design, a multiple-network design is
                 more amenable to power gating, as its subnetworks
                 (subnets) can be power gated without compromising the
                 connectivity of the network. To exploit this
                 opportunity, we propose the Catnap architecture which
                 consists of synergistic subnet selection and
                 power-gating policies. Catnap maximizes the number of
                 consecutive idle cycles in a router, while avoiding
                 performance loss due to overloading a subnet. We
                 evaluate a 256-core processor with a concentrated mesh
                 topology using synthetic traffic and 35 applications.
                 We show that the average network power of a
                 power-gating optimized multiple-network design with
                 four subnets could be 44\% lower than a bandwidth
                 equivalent single-network design for an average
                 performance cost of about 5\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Jog:2013:OSP,
  author =       "Adwait Jog and Onur Kayiran and Asit K. Mishra and
                 Mahmut T. Kandemir and Onur Mutlu and Ravishankar Iyer
                 and Chita R. Das",
  title =        "Orchestrated scheduling and prefetching for {GPGPUs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "332--343",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485951",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "In this paper, we present techniques that coordinate
                 the thread scheduling and prefetching decisions in a
                 General Purpose Graphics Processing Unit (GPGPU)
                 architecture to better tolerate long memory latencies.
                 We demonstrate that existing warp scheduling policies
                 in GPGPU architectures are unable to effectively
                 incorporate data prefetching. The main reason is that
                 they schedule consecutive warps, which are likely to
                 access nearby cache blocks and thus prefetch accurately
                 for one another, back-to-back in consecutive cycles.
                 This either (1) causes prefetches to be generated by a
                 warp too close to the time their corresponding
                 addresses are actually demanded by another warp, or (2)
                 requires sophisticated prefetcher designs to correctly
                 predict the addresses required by a future
                 ``far-ahead'' warp while executing the current warp. We
                 propose a new prefetch-aware warp scheduling policy
                 that overcomes these problems. The key idea is to
                 separate in time the scheduling of consecutive warps
                 such that they are not executed back-to-back. We show
                 that this policy not only enables a simple prefetcher
                 to be effective in tolerating memory latencies but also
                 improves memory bank parallelism, even when prefetching
                 is not employed. Experimental evaluations across a
                 diverse set of applications on a 30-core simulated
                 GPGPU platform demonstrate that the prefetch-aware warp
                 scheduler provides 25\% and 7\% average performance
                 improvement over baselines that employ prefetching in
                 conjunction with, respectively, the commonly-employed
                 round-robin scheduler or the recently-proposed
                 two-level warp scheduler. Moreover, when prefetching is
                 not employed, the prefetch-aware warp scheduler
                 provides higher performance than both of these baseline
                 schedulers as it better exploits memory bank
                 parallelism.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Jing:2013:EES,
  author =       "Naifeng Jing and Yao Shen and Yao Lu and Shrikanth
                 Ganapathy and Zhigang Mao and Minyi Guo and Ramon Canal
                 and Xiaoyao Liang",
  title =        "An energy-efficient and scalable {eDRAM}-based
                 register file architecture for {GPGPU}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "344--355",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485952",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "The heavily-threaded data processing demands of
                 streaming multiprocessors (SM) in a GPGPU require a
                 large register file (RF). The fast increasing size of
                 the RF makes the area cost and power consumption
                 unaffordable for traditional SRAM designs in the future
                 technologies. In this paper, we propose to use
                 embedded-DRAM (eDRAM) as an alternative in future
                 GPGPUs. Compared with SRAM, eDRAM provides higher
                 density and lower leakage power. However, the limited
                 data retention time in eDRAM poses new challenges.
                 Periodic refresh operations are needed to maintain data
                 integrity. This is exacerbated with the scaling of
                 eDRAM density, process variations and temperature.
                 Unlike conventional CPUs which make use of multi-ported
                 RF, most of the RFs in modern GPGPU are heavily banked
                 but not multi-ported to reduce the hardware cost. This
                 provides a unique opportunity to hide the refresh
                 overhead. We propose two different eDRAM
                 implementations based on 3T1D and 1T1C memory cells. To
                 mitigate the impact of periodic refresh, we propose two
                 novel refresh solutions using bank bubble and bank
                 walk-through. Plus, for the 1T1C RF, we design an
                 interleaved bank organization together with an
                 intelligent warp scheduling strategy to reduce the
                 impact of the destructive reads. The analysis shows
                 that our schemes present better energy efficiency,
                 scalability and variation tolerance than traditional
                 SRAM-based designs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Rhu:2013:MSR,
  author =       "Minsoo Rhu and Mattan Erez",
  title =        "Maximizing {SIMD} resource utilization in {GPGPUs}
                 with {SIMD} lane permutation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "356--367",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485953",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Current GPUs maintain high programmability by
                 abstracting the SIMD nature of the hardware as
                 independent concurrent threads of control with hardware
                 responsible for generating predicate masks to utilize
                 the SIMD hardware for different flows of control. This
                 dynamic masking leads to poor utilization of SIMD
                 resources when the control of different threads in the
                 same SIMD group diverges. Prior research suggests that
                 SIMD groups be formed dynamically by compacting a large
                 number of threads into groups, mitigating the impact of
                 divergence. To maintain hardware efficiency, however,
                 the alignment of a thread to a SIMD lane is fixed,
                 limiting the potential for compaction. We observe that
                 control frequently diverges in a manner that prevents
                 compaction because of the way in which the fixed
                 alignment of threads to lanes is done. This paper
                 presents an in-depth analysis on the causes for
                 ineffective compaction. An important observation is
                 that in many cases, control diverges because of
                 programmatic branches, which do not depend on input
                 data. This behavior, when combined with the default
                 mapping of threads to lanes, severely restricts
                 compaction. We then propose SIMD lane permutation (SLP)
                 as an optimization to expand the applicability of
                 compaction in such cases of lane alignment. SLP seeks
                 to rearrange how threads are mapped to lanes to allow
                 even programmatic branches to be compacted effectively,
                 improving SIMD utilization up to 34\% accompanied by a
                 maximum 25\% performance boost.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Vaidya:2013:SDO,
  author =       "Aniruddha S. Vaidya and Anahita Shayesteh and Dong
                 Hyuk Woo and Roy Saharoy and Mani Azimi",
  title =        "{SIMD} divergence optimization through intra-warp
                 compaction",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "368--379",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485954",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "SIMD execution units in GPUs are increasingly used for
                 high performance and energy efficient acceleration of
                 general purpose applications. However, SIMD control
                 flow divergence effects can result in reduced execution
                 efficiency in a class of GPGPU applications, classified
                 as divergent applications. Improving SIMD efficiency,
                 therefore, has the potential to bring significant
                 performance and energy benefits to a wide range of such
                 data parallel applications. Recently, the SIMD
                 divergence problem has received increased attention,
                 and several micro-architectural techniques have been
                 proposed to address various aspects of this problem.
                 However, these techniques are often quite complex and,
                 therefore, unlikely candidates for practical
                 implementation. In this paper, we propose two
                 micro-architectural optimizations for GPGPU
                 architectures, which utilize relatively simple
                 execution cycle compression techniques when certain
                 groups of turned-off lanes exist in the instruction
                 stream. We refer to these optimizations as basic cycle
                 compression (BCC) and swizzled-cycle compression (SCC),
                 respectively. In this paper, we will outline the
                 additional requirements for implementing these
                 optimizations in the context of the studied GPGPU
                 architecture. Our evaluations with divergent SIMD
                 workloads from OpenCL (GPGPU) and OpenGL (graphics)
                 applications show that BCC and SCC reduce execution
                 cycles in divergent applications by as much as 42\%
                 (20\% on average). For a subset of divergent workloads,
                 the execution time is reduced by an average of 7\% for
                 today's GPUs or by 18\% for future GPUs with a better
                 provisioned memory subsystem. The key contribution of
                 our work is in simplifying the micro-architecture for
                 delivering divergence optimizations while providing the
                 bulk of the benefits of more complex approaches.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Son:2013:RMA,
  author =       "Young Hoon Son and Seongil O and Yuhwan Ro and Jae W.
                 Lee and Jung Ho Ahn",
  title =        "Reducing memory access latency with asymmetric {DRAM}
                 bank organizations",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "380--391",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485955",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "DRAM has been a de facto standard for main memory, and
                 advances in process technology have led to a rapid
                 increase in its capacity and bandwidth. In contrast,
                 its random access latency has remained relatively
                 stagnant, as it is still around 100 CPU clock cycles.
                 Modern computer systems rely on caches or other latency
                 tolerance techniques to lower the average access
                 latency. However, not all applications have ample
                 parallelism or locality that would help hide or reduce
                 the latency. Moreover, applications' demands for memory
                 space continue to grow, while the capacity gap between
                 last-level caches and main memory is unlikely to
                 shrink. Consequently, reducing the main-memory latency
                 is important for application performance.
                 Unfortunately, previous proposals have not adequately
                 addressed this problem, as they have focused only on
                 improving the bandwidth and capacity or reduced the
                 latency at the cost of significant area overhead. We
                 propose asymmetric DRAM bank organizations to reduce
                 the average main-memory access latency. We first
                 analyze the access and cycle times of a modern DRAM
                 device to identify key delay components for latency
                 reduction. Then we reorganize a subset of DRAM banks to
                 reduce their access and cycle times by half with low
                 area overhead. By synergistically combining these
                 reorganized DRAM banks with support for non-uniform
                 bank accesses, we introduce a novel DRAM bank
                 organization with center high-aspect-ratio mats called
                 CHARM. Experiments on a simulated chip-multiprocessor
                 system show that CHARM improves both the instructions
                 per cycle and system-wide energy-delay product up to
                 21\% and 32\%, respectively, with only a 3\% increase
                 in die area.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Liu:2013:CTP,
  author =       "Ziyi Liu and JongHyuk Lee and Junyuan Zeng and
                 Yuanfeng Wen and Zhiqiang Lin and Weidong Shi",
  title =        "{CPU} transparent protection of {OS} kernel and
                 hypervisor integrity with programmable {DRAM}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "392--403",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485956",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Increasingly, cyber attacks (e.g., kernel rootkits)
                 target the inner rings of a computer system, and they
                 have seriously undermined the integrity of the entire
                 computer systems. To eliminate these threats, it is
                 imperative to develop innovative solutions running
                 below the attack surface. This paper presents MGuard, a
                 new most inner ring solution for inspecting the system
                 integrity that is directly integrated with the DRAM
                 DIMM devices. More specifically, we design a
                 programmable guard that is integrated with the advanced
                 memory buffer of FB-DIMM to continuously monitor all
                 the memory traffic and detect the system integrity
                 violations. Unlike the existing approaches that are
                 either snapshot-based or lack compatibility and
                 flexibility, MGuard continuously monitors the integrity
                 of all the outer rings including both OS kernel and
                 hypervisor of interest, with a greater extendibility
                 enabled by a programmable interface. It offers a
                 hardware drop-in solution transparent to the host CPU
                 and memory controller. Moreover, MGuard is isolated
                 from the host software and hardware, leading to strong
                 security for remote attackers. Our simulation-based
                 experimental results show that MGuard introduces no
                 speed overhead, and is able to detect nearly all the
                 OS-kernel and hypervisor control data related rootkits
                 we tested.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Jevdjic:2013:SDC,
  author =       "Djordje Jevdjic and Stavros Volos and Babak Falsafi",
  title =        "Die-stacked {DRAM} caches for servers: hit ratio,
                 latency, or bandwidth? {Have} it all with footprint
                 cache",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "404--415",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485957",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Recent research advocates using large die-stacked DRAM
                 caches to break the memory bandwidth wall. Existing
                 DRAM cache designs fall into one of two categories ---
                 block-based and page-based. The former organize data in
                 conventional blocks (e.g., 64B), ensuring low off-chip
                 bandwidth utilization, but co-locate tags and data in
                 the stacked DRAM, incurring high lookup latency.
                 Furthermore, such designs suffer from low hit ratios
                 due to poor temporal locality. In contrast, page-based
                 caches, which manage data at larger granularity (e.g.,
                 4KB pages), allow for reduced tag array overhead and
                 fast lookup, and leverage high spatial locality at the
                 cost of moving large amounts of data on and off the
                 chip. This paper introduces Footprint Cache, an
                 efficient die-stacked DRAM cache design for server
                 processors. Footprint Cache allocates data at the
                 granularity of pages, but identifies and fetches only
                 those blocks within a page that will be touched during
                 the page's residency in the cache --- i.e., the page's
                 footprint. In doing so, Footprint Cache eliminates the
                 excessive off-chip traffic associated with page-based
                 designs, while preserving their high hit ratio, small
                 tag array overhead, and low lookup latency.
                 Cycle-accurate simulation results of a 16-core server
                 with up to 512MB Footprint Cache indicate a 57\%
                 performance improvement over a baseline chip without a
                 die-stacked cache. Compared to a state-of-the-art
                 block-based design, our design improves performance by
                 13\% while reducing dynamic energy of stacked DRAM by
                 24\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sim:2013:RSD,
  author =       "Jaewoong Sim and Gabriel H. Loh and Vilas Sridharan
                 and Mike O'Connor",
  title =        "Resilient die-stacked {DRAM} caches",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "416--427",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485958",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Die-stacked DRAM can provide large amounts of
                 in-package, high-bandwidth cache storage. For server
                 and high-performance computing markets, however, such
                 DRAM caches must also provide sufficient support for
                 reliability and fault tolerance. While conventional
                 off-chip memory provides ECC support by adding one or
                 more extra chips, this may not be practical in a 3D
                 stack. In this paper, we present a DRAM cache
                 organization that uses error-correcting codes (ECCs),
                 strong checksums (CRCs), and dirty data duplication to
                 detect and correct a wide range of stacked DRAM
                 failures, from traditional bit errors to large-scale
                 row, column, bank, and channel failures. With only a
                 modest performance degradation compared to a DRAM cache
                 with no ECC support, our proposal can correct all
                 single-bit failures, and 99.9993\% of all row, column,
                 and bank failures, providing more than a 54,000x
                 improvement in the FIT rate of silent-data corruptions
                 compared to basic SECDED ECC protection.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Du:2013:BMB,
  author =       "Yu Du and Miao Zhou and Bruce R. Childers and Daniel
                 Moss{\'e} and Rami Melhem",
  title =        "Bit mapping for balanced {PCM} cell programming",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "428--439",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485959",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Write bandwidth is an inherent performance bottleneck
                 for Phase Change Memory (PCM) for two reasons. First,
                 PCM cells have long programming time, and second, only
                 a limited number of PCM cells can be programmed
                 concurrently due to programming current and write
                 circuit constraints. For each PCM write, the data bits
                 of the write request are typically mapped to multiple
                 cell groups and processed in parallel. We observed that
                 an unbalanced distribution of modified data bits among
                 cell groups significantly increases PCM write time and
                 hurts effective write bandwidth. To address this issue,
                 we first uncover the cyclical and cluster patterns for
                 modified data bits. Next, we propose double XOR mapping
                 (D-XOR) to distribute modified data bits among cell
                 groups in a balanced way. D-XOR can reduce PCM write
                 service time by 45\% on average, which increases PCM
                 write throughput by 1.8x. As error correction
                 (redundant bits) is critical for PCM, we also consider
                 the impact of redundancy information in mapping data
                 and error correction bits to cell groups. Our
                 techniques lead to a 51\% average reduction in write
                 service time for a PCM main memory with ECC, which
                 increases IPC by 12\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Seong:2013:TLC,
  author =       "Nak Hee Seong and Sungkap Yeo and Hsien-Hsin S. Lee",
  title =        "Tri-level-cell phase change memory: toward an
                 efficient and reliable memory system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "440--451",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485960",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "There are several emerging memory technologies looming
                 on the horizon to compensate the physical scaling
                 challenges of DRAM. Phase change memory (PCM) is one
                 such candidate proposed for being part of the main
                 memory in computing systems. One salient feature of PCM
                 is its multi-level-cell (MLC) property, which can be
                 used to multiply the memory capacity at the cell level.
                 However, due to the nature of PCM that the value
                 written to the cell can drift over time, PCM is prone
                 to a unique type of soft errors, posing a great
                 challenge for their practical deployment. This paper
                 first quantitatively studied the current art for MLC
                 PCM in dealing with the resistance drift problem and
                 showed that the previously proposed techniques such as
                 scrubbing or error correction mechanisms have
                 significant reliability challenges to overcome. We then
                 propose tri-level-cell PCM and demonstrate its ability
                 to achieving $ 10^5 \times $ lower soft error rate
                 than four-level-cell PCM and $ 1.33 \times $ higher
                 information density than single-level-cell PCM.
                 According to our findings, the tri-level-cell PCM shows
                 36.4\% performance improvement over the four-level-cell
                 PCM while achieving the soft error rate of DRAM.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Azevedo:2013:ZME,
  author =       "Rodolfo Azevedo and John D. Davis and Karin Strauss
                 and Parikshit Gopalan and Mark Manasse and Sergey
                 Yekhanin",
  title =        "Zombie memory: extending memory lifetime by reviving
                 dead blocks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "452--463",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485961",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Zombie is an endurance management framework that
                 enables a variety of error correction mechanisms to
                 extend the lifetimes of memories that suffer from bit
                 failures caused by wearout, such as phase-change memory
                 (PCM). Zombie supports both single-level cell (SLC) and
                 multi-level cell (MLC) variants. It extends the
                 lifetime of blocks in working memory pages (primary
                 blocks) by pairing them with spare blocks, i.e.,
                 working blocks in pages that have been disabled due to
                 exhaustion of a single block's error correction
                 resources, which would be `dead' otherwise. Spare
                 blocks adaptively provide error correction resources to
                 primary blocks as failures accumulate over time. This
                 reduces the waste caused by early block failures,
                 making working blocks in discarded pages a useful
                 resource. Even though we use PCM as the target
                 technology, Zombie applies to any memory technology
                 that suffers stuck-at cell failures. This paper
                 describes the Zombie framework, a combination of two
                 new error correction mechanisms (ZombieXOR for SLC and
                 ZombieMLC for MLC) and the extension of two previously
                 proposed SLC mechanisms (ZombieECP and ZombieERC). The
                 result is a 58\% to 92\% improvement in endurance for
                 Zombie SLC memory and an even more impressive 11x to
                 17x improvement for ZombieMLC, both with performance
                 overheads of only 0.1\% when memories using prior error
                 correction mechanisms reach end of life.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Caulfield:2013:QSA,
  author =       "Adrian M. Caulfield and Steven Swanson",
  title =        "{QuickSAN}: a storage area network for fast,
                 distributed, solid state disks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "464--474",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485962",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Solid State Disks (SSDs) based on flash and other
                 non-volatile memory technologies reduce storage
                 latencies from 10s of milliseconds to 10s or 100s of
                 microseconds, transforming previously inconsequential
                 storage overheads into performance bottlenecks. This
                 problem is especially acute in storage area network
                 (SAN) environments where complex hardware and software
                 layers (distributed file systems, block servers, network
                 stacks, etc.) lie between applications and remote data.
                 These layers can add hundreds of microseconds to
                 requests, obscuring the performance of both flash
                 memory and faster, emerging non-volatile memory
                 technologies. We describe QuickSAN, a SAN prototype
                 that eliminates most software overheads and
                 significantly reduces hardware overheads in SANs.
                 QuickSAN integrates a network adapter into SSDs, so the
                 SSDs can communicate directly with one another to
                 service storage accesses as quickly as possible.
                 QuickSAN can also give applications direct access to
                 both local and remote data without operating system
                 intervention, further reducing software costs. Our
                 evaluation of QuickSAN demonstrates remote access
                 latencies of 20 $ \mu $ s for 4 KB requests, bandwidth
                 improvements of as much as 163x for small accesses
                 compared with an equivalent iSCSI implementation, and
                 2.3--3.0x application level speedup for distributed
                 sorting. We also show that QuickSAN improves energy
                 efficiency by up to 96\% and that QuickSAN's networking
                 connectivity allows for improved cluster-level energy
                 efficiency under varying load.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sanchez:2013:ZFA,
  author =       "Daniel Sanchez and Christos Kozyrakis",
  title =        "{ZSim}: fast and accurate microarchitectural
                 simulation of thousand-core systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "475--486",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485963",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Architectural simulation is time-consuming, and the
                 trend towards hundreds of cores is making sequential
                 simulation even slower. Existing parallel simulation
                 techniques either scale poorly due to excessive
                 synchronization, or sacrifice accuracy by allowing
                 event reordering and using simplistic contention
                 models. As a result, most researchers use sequential
                 simulators and model small-scale systems with 16--32
                 cores. With 100-core chips already available,
                 developing simulators that scale to thousands of cores
                 is crucial. We present three novel techniques that,
                 together, make thousand-core simulation practical.
                 First, we speed up detailed core models (including OOO
                 cores) with instruction-driven timing models that
                 leverage dynamic binary translation. Second, we
                 introduce bound-weave, a two-phase parallelization
                 technique that scales parallel simulation on multicore
                 hosts efficiently with minimal loss of accuracy. Third,
                 we implement lightweight user-level virtualization to
                 support complex workloads, including multiprogrammed,
                 client-server, and managed-runtime applications,
                 without the need for full-system simulation,
                 sidestepping the lack of scalable OSs and ISAs that
                 support thousands of cores. We use these techniques to
                 build zsim, a fast, scalable, and accurate simulator.
                 On a 16-core host, zsim models a 1024-core chip at
                 speeds of up to 1,500 MIPS using simple cores and up to
                 300 MIPS using detailed OOO cores, 2--3 orders of
                 magnitude faster than existing parallel simulators.
                 Simulator performance scales well with both the number
                 of modeled cores and the number of host cores. We
                 validate zsim against a real Westmere system on a wide
                 variety of workloads, and find performance and
                 microarchitectural events to be within a narrow range
                 of the real system.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Leng:2013:GEE,
  author =       "Jingwen Leng and Tayler Hetherington and Ahmed
                 ElTantawy and Syed Gilani and Nam Sung Kim and Tor M.
                 Aamodt and Vijay Janapa Reddi",
  title =        "{GPUWattch}: enabling energy optimizations in
                 {GPGPUs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "487--498",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485964",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "General-purpose GPUs (GPGPUs) are becoming prevalent
                 in mainstream computing, and performance per watt has
                 emerged as a more crucial evaluation metric than peak
                 performance. As such, GPU architects require robust
                 tools that will enable them to quickly explore new ways
                 to optimize GPGPUs for energy efficiency. We propose a
                 new GPGPU power model that is configurable, capable of
                 cycle-level calculations, and carefully validated
                 against real hardware measurements. To achieve
                 configurability, we use a bottom-up methodology and
                 abstract parameters from the microarchitectural
                 components as the model's inputs. We developed a
                 rigorous suite of 80 microbenchmarks that we use to
                 bound any modeling uncertainties and inaccuracies. The
                 power model is comprehensively validated against
                 measurements of two commercially available GPUs, and
                 the measured error is within 9.9\% and 13.4\% for the
                 two target GPUs (GTX 480 and Quadro FX5600). The model
                 also accurately tracks the power consumption trend over
                 time. We integrated the power model with the
                 cycle-level simulator GPGPU-Sim and demonstrate the
                 energy savings by utilizing dynamic voltage and
                 frequency scaling (DVFS) and clock gating. Traditional
                 DVFS reduces GPU energy consumption by 14.4\% by
                 leveraging within-kernel runtime variations. More
                 finer-grained SM cluster-level DVFS improves the energy
                 savings from 6.6\% to 13.6\% for those benchmarks that
                 show clustered execution behavior. We also show that
                 clock gating inactive lanes during divergence reduces
                 dynamic power by 11.2\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wu:2013:SMP,
  author =       "Meng-Ju Wu and Minshu Zhao and Donald Yeung",
  title =        "Studying multicore processor scaling via reuse
                 distance analysis",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "499--510",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485965",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "The trend for multicore processors is towards
                 increasing numbers of cores, with 100s of cores --- i.e.,
                 large-scale chip multiprocessors (LCMPs) --- possible
                 in the future. The key to realizing the potential of
                 LCMPs is the cache hierarchy, so studying how memory
                 performance will scale is crucial. Reuse distance (RD)
                 analysis can help architects do this. In particular,
                 recent work has developed concurrent reuse distance
                 (CRD) and private reuse distance (PRD) profiles to
                 enable analysis of shared and private caches. Also,
                 techniques have been developed to predict profiles
                 across problem size and core count, enabling the
                 analysis of configurations that are too large to
                 simulate. This paper applies RD analysis to study the
                 scalability of multicore cache hierarchies. We present
                 a framework based on CRD and PRD profiles for reasoning
                 about the locality impact of core count and problem
                 scaling. We find interference-based locality
                 degradation is more significant than sharing-based
                 locality degradation. For 256 cores running small
                 problems, the former occurs at small cache sizes,
                 allowing moderate capacity scaling of multicore caches
                 to achieve the same cache performance (MPKI) as a
                 single-core cache. At very large problems,
                 interference-based locality degradation increases
                 significantly in many of our benchmarks. For shared
                 caches, this prevents most of our benchmarks from
                 achieving constant-MPKI scaling within a 256 MB
                 capacity budget; for private caches, all benchmarks
                 cannot achieve constant-MPKI scaling within 256 MB.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{DuBois:2013:CSI,
  author =       "Kristof {Du Bois} and Stijn Eyerman and Jennifer B.
                 Sartor and Lieven Eeckhout",
  title =        "Criticality stacks: identifying critical threads in
                 parallel programs using synchronization behavior",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "511--522",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485966",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Analyzing multi-threaded programs is quite
                 challenging, but is necessary to obtain good multicore
                 performance while saving energy. Due to
                 synchronization, certain threads make others wait,
                 because they hold a lock or have yet to reach a
                 barrier. We call these critical threads, i.e., threads
                 whose performance is determinative of program
                 performance as a whole. Identifying these threads can
                 reveal numerous optimization opportunities, for the
                 software developer and for hardware. In this paper, we
                 propose a new metric for assessing thread criticality,
                 which combines both how much time a thread is
                 performing useful work and how many co-running threads
                 are waiting. We show how thread criticality can be
                 calculated online with modest hardware additions and
                 with low overhead. We use our metric to create
                 criticality stacks that break total execution time into
                 each thread's criticality component, allowing for easy
                 visual analysis of parallel imbalance. To validate our
                 criticality metric, and demonstrate it is better than
                 previous metrics, we scale the frequency of the most
                 critical thread and show it achieves the largest
                 performance improvement. We then demonstrate the broad
                 applicability of criticality stacks by using them to
                 perform three types of optimizations: (1) program
                 analysis to remove parallel bottlenecks, (2)
                 dynamically identifying the most critical thread and
                 accelerating it using frequency scaling to improve
                 performance, and (3) showing that accelerating only the
                 most critical thread allows for targeted energy
                 reduction.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kurian:2013:LAA,
  author =       "George Kurian and Omer Khan and Srinivas Devadas",
  title =        "The locality-aware adaptive cache coherence protocol",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "523--534",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485967",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Next generation multicore applications will process
                 massive amounts of data with significant sharing. Data
                 movement and management impacts memory access latency
                 and consumes power. Therefore, harnessing data locality
                 is of fundamental importance in future processors. We
                 propose a scalable, efficient shared memory cache
                 coherence protocol that enables seamless adaptation
                 between private and logically shared caching of on-chip
                 data at the fine granularity of cache lines. Our
                 data-centric approach relies on in-hardware yet
                 low-overhead runtime profiling of the locality of each
                 cache line and only allows private caching for data
                 blocks with high spatio-temporal locality. This allows
                 us to better exploit the private caches and enable
                 low-latency, low-energy memory access, while retaining
                 the convenience of shared memory. On a set of parallel
                 benchmarks, our low-overhead locality-aware mechanisms
                 reduce the overall energy by 25\% and completion time
                 by 15\% in an NoC-based multicore with the
                 Reactive-NUCA on-chip cache organization and the
                 ACKwise limited directory-based coherence protocol.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kaxiras:2013:NPE,
  author =       "Stefanos Kaxiras and Alberto Ros",
  title =        "A new perspective for efficient virtual-cache
                 coherence",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "535--546",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485968",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Coherent shared virtual memory (cSVM) is highly
                 coveted for heterogeneous architectures as it will
                 simplify programming across different cores and
                 manycore accelerators. In this context, virtual L1
                 caches can be used to great advantage, e.g., saving
                 energy consumption by eliminating address translation
                 for hits. Unfortunately, multicore virtual-cache
                 coherence is complex and costly because it requires
                 reverse translation for any coherence request directed
                 towards a virtual L1. The reason is the ambiguity of
                 the virtual address due to the possibility of synonyms.
                 In this paper, we take a radically different approach
                 than all prior work which is focused on reverse
                 translation. We examine the problem from the
                 perspective of the coherence protocol. We show that if
                 a coherence protocol adheres to certain conditions, it
                 operates effortlessly with virtual caches, without
                 requiring reverse translations even in the presence of
                 synonyms. We show that these conditions hold in a new
                 class of simple and efficient request-response
                 protocols that use both self-invalidation and
                 self-downgrade. This results in a new solution for
                 virtual-cache coherence, significantly less complex and
                 more efficient than prior proposals. We study design
                 choices for TLB placement under our proposal and
                 compare them against those under a directory-MESI
                 protocol. Our approach allows for choices that are
                 particularly effective as for example combining all
                 per-core TLBs in a single logical TLB in front of the
                 last level cache. Significant area, energy, and
                 performance benefits ensue as a result of simplifying
                 the entire multicore memory organization.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Zhao:2013:PAG,
  author =       "Hongzhou Zhao and Arrvindh Shriraman and Snehasish
                 Kumar and Sandhya Dwarkadas",
  title =        "{Protozoa}: adaptive granularity cache coherence",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "547--558",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485969",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "State-of-the-art multiprocessor cache hierarchies
                 propagate the use of a fixed granularity in the cache
                 organization to the design of the coherence protocol.
                 Unfortunately, the fixed granularity, generally chosen
                 to match average spatial locality across a range of
                 applications, not only results in wasted bandwidth to
                 serve an individual thread's access needs, but also
                 results in unnecessary coherence traffic for shared
                 data. The additional bandwidth has a direct impact on
                 both the scalability of parallel applications and
                 overall energy consumption. In this paper, we present
                 the design of Protozoa, a family of coherence protocols
                 that eliminate unnecessary coherence traffic and match
                 data movement to an application's spatial locality.
                 Protozoa continues to maintain metadata at a
                 conventional fixed cache line granularity while (1)
                 supporting variable read and write caching granularity
                 so that data transfer matches application spatial
                 granularity, (2) invalidating at the granularity of the
                 write miss request so that readers to disjoint data can
                 co-exist with writers, and (3) potentially supporting
                 multiple non-overlapping writers within the cache line,
                 thereby avoiding the traditional ping-pong effect of
                 both read-write and write-write false sharing. Our
                 evaluation demonstrates that Protozoa consistently
                 reduce miss rate and improve the fraction of
                 transmitted data that is actually utilized.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Demme:2013:FOM,
  author =       "John Demme and Matthew Maycock and Jared Schmitz and
                 Adrian Tang and Adam Waksman and Simha Sethumadhavan
                 and Salvatore Stolfo",
  title =        "On the feasibility of online malware detection with
                 performance counters",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "559--570",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485970",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "The proliferation of computers in any domain is
                 followed by the proliferation of malware in that
                 domain. Systems, including the latest mobile platforms,
                 are laden with viruses, rootkits, spyware, adware and
                 other classes of malware. Despite the existence of
                 anti-virus software, malware threats persist and are
                 growing as there exist a myriad of ways to subvert
                 anti-virus (AV) software. In fact, attackers today
                 exploit bugs in the AV software to break into systems.
                 In this paper, we examine the feasibility of building a
                 malware detector in hardware using existing performance
                 counters. We find that data from performance counters
                 can be used to identify malware and that our detection
                 techniques are robust to minor variations in malware
                 programs. As a result, after examining a small set of
                 variations within a family of malware on Android ARM
                 and Intel Linux platforms, we can detect many
                 variations within that family. Further, our proposed
                 hardware modifications allow the malware detector to
                 run securely beneath the system software, thus setting
                 the stage for AV implementations that are simpler and
                 less buggy than software AV. Combined, the robustness
                 and security of hardware AV techniques have the
                 potential to advance state-of-the-art online malware
                 detection.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ren:2013:DSE,
  author =       "Ling Ren and Xiangyao Yu and Christopher W. Fletcher
                 and Marten van Dijk and Srinivas Devadas",
  title =        "Design space exploration and optimization of path
                 oblivious {RAM} in secure processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "571--582",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485971",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Keeping user data private is a huge problem both in
                 cloud computing and computation outsourcing. One
                 paradigm to achieve data privacy is to use
                 tamper-resistant processors, inside which users'
                 private data is decrypted and computed upon. These
                 processors need to interact with untrusted external
                 memory. Even if we encrypt all data that leaves the
                 trusted processor, however, the address sequence that
                 goes off-chip may still leak information. To prevent
                 this address leakage, the security community has
                 proposed ORAM (Oblivious RAM). ORAM has mainly been
                 explored in server/file settings which assume a vastly
                 different computation model than secure processors. Not
                 surprisingly, na{\"\i}vely applying ORAM to a secure
                 processor setting incurs large performance overheads.
                 In this paper, a recent proposal called Path ORAM is
                 studied. We demonstrate techniques to make Path ORAM
                 practical in a secure processor setting. We introduce
                 background eviction schemes to prevent Path ORAM
                 failure and allow for a performance-driven design space
                 exploration. We propose a concept called super blocks
                 to further improve Path ORAM's performance, and also
                 show an efficient integrity verification scheme for
                 Path ORAM. With our optimizations, Path ORAM overhead
                 drops by 41.8\%, and SPEC benchmark execution time
                 improves by 52.4\% in relation to a baseline
                 configuration. Our work can be used to improve the
                 security level of previous secure processors.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wassel:2013:SLL,
  author =       "Hassan M. G. Wassel and Ying Gao and Jason K. Oberg
                 and Ted Huffmire and Ryan Kastner and Frederic T. Chong
                 and Timothy Sherwood",
  title =        "{SurfNoC}: a low latency and provably non-interfering
                 approach to secure networks-on-chip",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "583--594",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485972",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "As multicore processors find increasing adoption in
                 domains such as aerospace and medical devices where
                 failures have the potential to be catastrophic, strong
                 performance isolation and security become first-class
                 design constraints. When cores are used to run separate
                 pieces of the system, strong time and space
                 partitioning can help provide such guarantees. However,
                 as the number of partitions or the asymmetry in
                 partition bandwidth allocations grows, the additional
                 latency incurred by time multiplexing the network can
                 significantly impact performance. In this paper, we
                 introduce SurfNoC, an on-chip network that
                 significantly reduces the latency incurred by temporal
                 partitioning. By carefully scheduling the network into
                 waves that flow across the interconnect, data from
                 different domains carried by these waves are strictly
                 non-interfering while avoiding the significant
                 overheads associated with cycle-by-cycle time
                 multiplexing. We describe the scheduling policy and
                 router microarchitecture changes required, and evaluate
                 the information-flow security of a synthesizable
                 implementation through gate-level information flow
                 analysis. When comparing our approach for varying
                 numbers of domains and network sizes, we find that in
                 many cases SurfNoC can reduce the latency overhead of
                 implementing cycle-level non-interference by up to
                 85\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wang:2013:VPD,
  author =       "Di Wang and Chuangang Ren and Anand Sivasubramaniam",
  title =        "Virtualizing power distribution in datacenters",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "595--606",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485973",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Power infrastructure contributes to a significant
                 portion of datacenter expenditures. Overbooking this
                 infrastructure for a high percentile of the needs is
                 becoming more attractive than for occasional peaks.
                 There exist several computing knobs to cap the power
                 draw within such under-provisioned capacity. Recently,
                 batteries and other energy storage devices have been
                 proposed to provide a complementary alternative to
                 these knobs, which when decentralized (or
                 hierarchically placed), can temporarily take the load
                 to suppress power peaks propagating up the hierarchy.
                 With aggressive under-provisioning, the power hierarchy
                 becomes as central a datacenter resource as other
                 computing resources, making it imperative to carefully
                 allocate, isolate and manage this resource (including
                 batteries), across applications. Towards this goal, we
                 present vPower, a software system to virtualize power
                 distribution. vPower includes mechanisms and policies
                 to provide a virtual power hierarchy for each
                 application. It leverages traditional computing knobs
                 as well as batteries, to apportion and manage the
                 infrastructure between co-existing applications in the
                 hierarchy. vPower allows applications to specify their
                 power needs, performs admission control and placement,
                 dynamically monitors power usage, and enforces
                 allocations for fairness and system efficiency. Using
                 several datacenter applications, and a 2-level power
                 hierarchy prototype containing batteries at both
                 levels, we demonstrate the effectiveness of vPower when
                 working in an under-provisioned power infrastructure,
                 using the right computing knobs and the right batteries
                 at the right time. Results show over 50\% improved
                 system utilization and scale-out for vPower's
                 over-booking, and between 12--28\% better application
                 performance than traditional power-capping control
                 knobs. It also ensures isolation between applications
                 competing for power.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Yang:2013:BFP,
  author =       "Hailong Yang and Alex Breslow and Jason Mars and
                 Lingjia Tang",
  title =        "{Bubble-Flux}: precise online {QoS} management for
                 increased utilization in warehouse scale computers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "607--618",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485974",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Ensuring the quality of service (QoS) for
                 latency-sensitive applications while allowing
                 co-locations of multiple applications on servers is
                 critical for improving server utilization and reducing
                 cost in modern warehouse-scale computers (WSCs). Recent
                 work relies on static profiling to precisely predict
                 the QoS degradation that results from performance
                 interference among co-running applications to increase
                 the number of ``safe'' co-locations. However, these
                 static profiling techniques have several critical
                 limitations: (1) a priori knowledge of all workloads is
                 required for profiling, (2) it is difficult for the
                 prediction to capture or adapt to phase or load changes
                 of applications, and (3) the prediction technique is
                 limited to only two co-running applications. To address
                 all of these limitations, we present Bubble-Flux, an
                 integrated dynamic interference measurement and online
                 QoS management mechanism to provide accurate QoS
                 control and maximize server utilization. Bubble-Flux
                 uses a Dynamic Bubble to probe servers in real time to
                 measure the instantaneous pressure on the shared
                 hardware resources and precisely predict how the QoS of
                 a latency-sensitive job will be affected by potential
                 co-runners. Once ``safe'' batch jobs are selected and
                 mapped to a server, Bubble-Flux uses an Online Flux
                 Engine to continuously monitor the QoS of the
                 latency-sensitive application and control the execution
                 of batch jobs to adapt to dynamic input, phase, and
                 load changes to deliver satisfactory QoS. Batch
                 applications remain in a state of flux throughout
                 execution. Our results show that the utilization
                 improvement achieved by Bubble-Flux is up to 2.2x
                 better than the prior static approach.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Mars:2013:WMH,
  author =       "Jason Mars and Lingjia Tang",
  title =        "{Whare-map}: heterogeneity in ``homogeneous''
                 warehouse-scale computers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "619--630",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485975",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "Modern ``warehouse scale computers'' (WSCs) continue
                 to be embraced as homogeneous computing platforms.
                 However, due to frequent machine replacements and
                 upgrades, modern WSCs are in fact composed of diverse
                 commodity microarchitectures and machine
                 configurations. Yet, current WSCs are architected with
                 the assumption of homogeneity, leaving a potentially
                 significant performance opportunity unexplored. In this
                 paper, we expose and quantify the performance impact of
                 the ``homogeneity assumption'' for modern production
                 WSCs using industry-strength large-scale web-service
                 workloads. In addition, we argue for, and evaluate the
                 benefits of, a heterogeneity-aware WSC using commercial
                 web-service production workloads including Google's
                 web-search. We also identify key factors impacting the
                 available performance opportunity when exploiting
                 heterogeneity and introduce a new metric, opportunity
                 factor, to quantify an application's sensitivity to the
                 heterogeneity in a given WSC. To exploit heterogeneity
                 in ``homogeneous'' WSCs, we propose ``Whare-Map,'' the
                 WSC Heterogeneity Aware Mapper that leverages
                 already in-place continuous profiling subsystems found
                 in production environments. When employing
                 ``Whare-Map'', we observe a cluster-wide performance
                 improvement of 15\% on average over
                 heterogeneity-oblivious job placement and up to an 80\%
                 improvement for web-service applications that are
                 particularly sensitive to heterogeneity.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Foutris:2013:DMA,
  author =       "Nikos Foutris and Dimitris Gizopoulos and Xavier Vera
                 and Antonio Gonzalez",
  title =        "Deconfigurable microprocessor architectures for
                 silicon debug acceleration",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "631--642",
  month =        jun,
  year =         "2013",
  DOI =          "https://doi.org/10.1145/2508148.2485976",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "The share of silicon debug in the overall
                 microprocessor chips development cycle is rapidly
                 expanding due to the ever growing design complexity and
                 the limited efficiency of pre-silicon validation
                 methods. Massive application of short random test
                 programs on the prototype microprocessor chips is one
                 of the most effective parts of silicon debug. However,
                 a major bottleneck and source of ``noise'' in this
                 phase is that large numbers of random test programs
                 fail due to the same or similar design bugs. This
                 redundant behavior adds long delays in the debug flow
                 since each failing random program must be separately
                 examined, although it does not usually bring new debug
                 information. The development of effective techniques
                 that detect dominant modes of failure among random
                 programs and triage them into common categories
                 eliminate redundant debug sessions and significantly
                 boost silicon debug. We propose the employment of
                 deconfigurable microprocessor architectures along with
                 self-checking random test programs to reduce the
                 redundant debug sessions and make the triage step of
                 silicon debug more efficient. Several hardware
                 components of high performance microprocessor
                 micro-architectures can be deconfigured while keeping
                 the functional completeness of the design. This is the
                 property we exploit in our silicon debug methodology
                 for the triaging of random test programs. We support
                 our methodology by a hardware mechanism dedicated to
                 silicon debug that groups the failing test programs
                 into categories depending on the microprocessor
                 hardware components that need to be deconfigured for a
                 random test program to be correctly executed. Identical
                 deconfiguration sequences for multiple test programs
                 indicate the existence of redundancy among them and
                 group them together. This grouping significantly
                 reduces the number of failing tests that must be
                 debugged afterwards. Detailed evaluation of the method
                 on an x86 microprocessor demonstrates its efficiency in
                 reducing the debug sessions and thus in accelerating
                 silicon debug.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Pokam:2013:QPI,
  author =       "Gilles Pokam and Klaus Danne and Cristiano Pereira and
                 Rolf Kassa and Tim Kranich and Shiliang Hu and Justin
                 Gottschlich and Nima Honarmand and Nathan Dautenhahn
                 and Samuel T. King and Josep Torrellas",
  title =        "{QuickRec}: prototyping an {Intel} architecture
                 extension for record and replay of multithreaded
                 programs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "643--654",
  month =        jun,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2508148.2485977",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "There has been significant interest in
                 hardware-assisted deterministic Record and Replay (RnR)
                 systems for multithreaded programs on multiprocessors.
                 However, no proposal has implemented this technique in
                 a hardware prototype with full operating system
                 support. Such an implementation is needed to assess RnR
                 practicality. This paper presents QuickRec, the first
                 multicore Intel Architecture (IA) prototype of RnR for
                 multithreaded programs. QuickRec is based on QuickIA,
                 an Intel emulation platform for rapid prototyping of
                 new IA extensions. QuickRec is composed of a Xeon
                 server platform with FPGA-emulated second-generation
                 Pentium cores, and Capo3, a full software stack for
                 managing the recording hardware from within a modified
                 Linux kernel. This paper's focus is understanding and
                 evaluating the implementation issues of RnR on a real
                 platform. Our effort leads to some lessons learned, as
                 well as to some pointers for future research. We
                 demonstrate that RnR can be implemented efficiently on
                 a real multicore IA system. In particular, we show that
                 the rate of memory log generation is insignificant, and
                 that the recording hardware has negligible performance
                 overhead. However, the software stack incurs an average
                 recording overhead of nearly 13\%, which must be
                 reduced to enable always-on use of RnR.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Huang:2013:NRC,
  author =       "Ruirui Huang and Erik Halberg and G. Edward Suh",
  title =        "Non-race concurrency bug detection through
                 order-sensitive critical sections",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "3",
  pages =        "655--666",
  month =        jun,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2508148.2485978",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Sat Jul 27 06:58:55 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ISCA '13 conference proceedings.",
  abstract =     "This paper introduces a new heuristic condition for
                 non-race concurrency bugs, named order-sensitive
                 critical sections, and proposes a run-time bug
                 detection scheme based on the condition. The
                 order-sensitive critical sections are defined as a pair
                 of critical sections that can lead to non-deterministic
                 shared memory state depending on the order in which
                 they execute. In a sense, the order-sensitive critical
                 sections can be seen as extending the intuition in
                 using data races as a potential bug condition to
                 capture non-race bugs. Experiments show that the
                 proposed scheme provides a good coverage for multiple
                 types of non-race bugs, with a small number of false
                 positives. For example, the scheme detected all 9
                 real-world non-race bugs that were tested as well as
                 over 90\% of injected non-race bugs. Additionally, this
                 paper presents an efficient hardware architecture that
                 supports the proposed scheme with minor hardware
                 changes and a small amount of additional state --- a
                 9-KB buffer per core and a 1-bit tag per data cache
                 block. The hardware-based scheme could still detect all
                 9 real-world bugs that were tested and more than 84\%
                 of the injected non-race bugs. Moreover, the hardware
                 supported scheme has a negligible impact on
                 performance, with a 0.23\% slowdown on average.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Maitra:2013:HEM,
  author = {Maitra, Subhashis and Sinha, Amitabha},
  title = {High efficiency {MAC} unit used in digital signal
    processing and elliptic curve cryptography},
  journal = j-COMP-ARCH-NEWS,
  volume = {41},
  number = {4},
  pages = {1--7},
  month = sep,
  year = {2013},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/2560488.2560490},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Thu Jan 2 17:25:55 MST 2014},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
    https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {Computational complexities of different Algorithms to
    enhance the speed of response of digital signal
    processor and different cryptographic analysis are the
    important issues for the current researcher.
    Computational complexities means hardware complexities
    and timing complexities. Both the complexities depend
    on the design of the software and hardware. Arithmetic
    computation like addition and multiplication are the
    major parts in designing processor that helps to
    improve the efficiency and to reduce complexities.
    Hence the design of a multiplier unit is the major
    issue to the current researchers. There are different
    multiplication algorithms discussed in different
    research materials. In this paper, a new algorithm for
    multiplication has been proposed to enhance the speed
    of operation and to reduce hardware complexities. Also
    a comparative study of the proposed algorithm over
    different existing algorithms has been explained here
    along with VHDL model of the proposed architecture.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

%%% NOTE(review): in the abstract below, the cache-simulator name
%%% ``Gl cSim'' (two occurrences) looks garbled, possibly by a lost
%%% subscript or underscore; verify against the publisher's abstract.
@Article{Janjusic:2013:GMP,
  author =       "Tomislav Janjusic and Krishna Kavi",
  title =        "{Gleipnir}: a memory profiling and tracing tool",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "4",
  pages =        "8--12",
  month =        sep,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2560488.2560491",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 2 17:25:55 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In this article we describe a memory tracing and
                 profiling tool called Gleipnir. Gleipnir is a plug-in
                 tool for a widely used binary instrumentation
                 framework, Valgrind. Gleipnir's ability to collect fine
                 grained memory traces and associate each access to
                 source level data structures and elements of these
                 structures, makes it a good candidate tool for advanced
                 memory analysis and studying complex memory access
                 patterns. The data provided by Gleipnir may be used by
                 cache simulators to analyze accesses to data structure
                 elements and understand the dynamic memory behavior of
                 programs. The goal of Gleipnir is to give the
                 programmer aid in refactoring data and code. In
                 addition to Gleipnir we introduce a cache simulation
                 tool, Gl cSim. Gl cSim is an extension to DineroIV (a
                 uni-processor simulator) that tracks Gleipnir provided
                 trace and debug-information.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:2013:INb,
  author = {Thorson, Mark},
  title = {{Internet} nuggets},
  journal = j-COMP-ARCH-NEWS,
  volume = {41},
  number = {4},
  pages = {13--22},
  month = sep,
  year = {2013},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/2560488.2560493},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Thu Jan 2 17:25:55 MST 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Godard:2013:MSS,
  author =       "Ivan Godard",
  title =        "The {Mill}: split-stream encoding",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "5",
  pages =        "1--5",
  month =        dec,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2641361.2641363",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 18 17:12:43 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Real-world programs often thrash in the instruction
                 cache, especially when SMT methods are used. The
                 Mill\texttrademark{} split-stream encoding doubles the
                 effective capacity of the instruction cache at no
                 increase in per-instruction power usage or cache access
                 latency, while also sharply increasing the potential
                 maximal decode rate for instruction sets that use
                 variable-length encoding.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thomasian:2013:DAM,
  author =       "Alexander Thomasian",
  title =        "Disk arrays with multiple {RAID} levels",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "5",
  pages =        "6--24",
  month =        dec,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2641361.2641364",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 18 17:12:43 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "We propose Heterogeneous Disk Arrays (HDAs), which
                 allow multiple RAID levels for database applications to
                 coexist in a single disk array accommodating multiple
                 RAID levels. Our main concern is to efficiently utilize
                 disk bandwidth and capacity, while balancing disk loads
                 in a cloud storage environment, however, a small number
                 of disks is considered in this study for illustrative
                 purposes. Individual RAID levels are adjusted to data
                 availability requirements and workload demands.
                 Adopting the most stringent availability requirements
                 for all datasets would incur unnecessarily high
                 bandwidth overhead for updating datasets, which do not
                 have this requirement. Intermixing RAID levels is
                 beneficial from the viewpoint of balancing disk loads,
                 similarly to the striping paradigm in RAID5. The
                 suitability of the RAID levels varies with database
                 applications: RAID5 --- reading/writing large datasets
                 for data mining and warehousing, RAID1 --- high
                 performance OLTP applications. Several single pass data
                 allocation methods are proposed in this paper and
                 compared using synthetically generated allocation
                 requests in associated papers.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Maitra:2013:DSM,
  author = {Maitra, Subhashis and Sinha, Amitabha},
  title = {Design and simulation of {MAC} unit using
    combinational circuit and adder},
  journal = j-COMP-ARCH-NEWS,
  volume = {41},
  number = {5},
  pages = {25--33},
  month = dec,
  year = {2013},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/2641361.2641365},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Mon Aug 18 17:12:43 MDT 2014},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/fparith.bib;
    https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {Hardware and timing complexities of MAC unit to
    perform arithmetic operation like addition or
    multiplication especially in the field of Digital
    Signal Processing (DSP) or Elliptic Curve Cryptography
    (ECC) are the major issues to the designer. The
    multiplication operation is essential and abundant in
    DSP Applications. In order to achieve maximum
    implementation efficiency and timing performance,
    designing a DSP systems is critical and frequently
    presents a significant challenge to hardware engineers.
    There are certain multipliers that simplify this
    challenge by abstracting away FPGA device specifics,
    while maintaining the required maximum performance and
    resource efficiency. These multipliers are able to
    perform parallel multiplication and hence constant
    coefficient multiplication, both with differing
    implementation styles. Again with the aid of
    instantaneous resource estimation, hardware engineers
    can rapidly select the optimal solution for their
    system. The latest additions to the IP provide fine
    control over the latency using the concept of
    pipelining of the multipliers that are purely
    combinatorial to be fully pipelined. Here a new
    compensation method that reduces both the hardware and
    timing complexities of the multiplier used for DSP
    application or ECC application has been proposed. The
    design of the MAC unit based on the proposed
    compensation method has been dealt here properly using
    Xilinx 13.2 and compared with array multiplier, Booth
    multiplier and Vedic multiplier to show its novelty
    over them. The hardware complexity is reduced to about
    60\% of the original multiplier. Design results show
    that the proposed architecture has lower hardware
    overhead, lower error and fast operating speed as
    compared with array, Booth and Vedic multiplier.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Chau:2013:ASM,
  author = {Chau, Thomas C. P. and Targett, James S. and
    Wijeyasinghe, Marlon and Luk, Wayne and
    Cheung, Peter Y. K. and Cope, Benjamin and
    Eele, Alison and Maciejowski, Jan},
  title = {Accelerating sequential {Monte Carlo} method for
    real-time air traffic management},
  journal = j-COMP-ARCH-NEWS,
  volume = {41},
  number = {5},
  pages = {35--40},
  month = dec,
  year = {2013},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/2641361.2641367},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Mon Aug 18 17:12:43 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {This paper presents how field-programmable gate arrays
    (FPGAs) are used to accelerate the Sequential Monte
    Carlo method for air traffic management. A novel data
    structure is introduced for a particle stream that
    enables efficient evaluation of constraints and
    weights. A parallel implementation for this streaming
    data structure is designed, and an analytical model is
    provided for estimating the performance and resource
    usage of our implementation. We compare our design to
    implementations on CPU and GPU. We show 9.3 times speed
    up and 89 times improvement in energy efficiency over
    an Intel Core i7-950 CPU with 8 threads and demonstrate
    1.3 times speed up and 13.5 times improvement in energy
    efficiency over an NVIDIA Tesla C2070 GPU with 448
    cores. We also estimate the performance of FPGA in
    future scenario and show that FPGA is able to control
    15 times and 2.8 times more aircraft than CPU and GPU
    in real-time respectively.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Mahram:2013:NBC,
  author = {Mahram, Atabak and Herbordt, Martin C.},
  title = {{NCBI BLASTP} on the {Convey HC1-EX}},
  journal = j-COMP-ARCH-NEWS,
  volume = {41},
  number = {5},
  pages = {41--46},
  month = dec,
  year = {2013},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/2641361.2641368},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Mon Aug 18 17:12:43 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {The BLAST sequence alignment program is a central
    application in bioinformatics. The de facto standard
    version, NCBI BLAST, uses complex heuristics which make
    it challenging to simultaneously achieve both high
    performance and exact agreement. In previous work, a
    system that used novel FPGA-based filters reduced the
    input database by over 99.97\% without loss of
    sensitivity. In the present work we report experiences
    in getting from a prototype to a potential product for
    the Convey HC1-EX. There are several issues. The first
    is the efforts made to maintain timing for a highly
    complex configuration as it is optimized by including
    additional filter stages. This requires implementation
    and optimization of new interface logic as well as
    floor-planning. The second is the system-level
    tradeoffs necessary to maintain correctness. The issue
    here is preventing low frequency events, which
    necessarily cannot be mapped to the FPGA, from diluting
    the performance benefits without sacrificing
    sensitivity. We present results for various usage
    scenarios and find a factor of nearly 5x speed-up over
    a fully parallel implementation of the reference code
    on a contemporaneous CPU. We believe that the resulting
    system is the leading accelerated NCBI BLAST. The
    significance of this work is that, while such in-depth
    work is necessary to achieve high performance for
    complex systems, these issues are rarely described in
    the academic literature.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Sano:2013:ECC,
  author =       "Kentaro Sano and Yoshiaki Kono and Hayato Suzuki and
                 Ryotaro Chiba and Ryo Ito and Tomohiro Ueno and Kyo
                 Koizumi and Satoru Yamamoto",
  title =        "Efficient custom computing of fully-streamed lattice
                 {Boltzmann} method on tightly-coupled {FPGA} cluster",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "5",
  pages =        "47--52",
  month =        dec,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2641361.2641369",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 18 17:12:43 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper presents the detailed design of a custom
                 computing machine for fully-streamed LBM computation on
                 multiple FPGAs, and evaluates its efficiency with
                 prototype implementation. We design a unit for
                 completely streamed computation including boundary
                 treatment with a newly introduced cell attribute.
                 Experimental results demonstrate that the proposed
                 machine achieves high utilization of PEs, 99\% of the
                 peak performance, for one and two FPGAs computing a
                 large lattice. This is due to our fully-streamed design
                 to allow all arithmetic units to be efficiently
                 utilized with a constant memory bandwidth, and the
                 architecture to exploit a low-latency accelerator
                 domain network (ADN) of a tightly-coupled FPGA cluster
                 for scalable computation.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Vanderbauwhede:2013:HCF,
  author = {Vanderbauwhede, Wim and Frolov, Anton and
    Chalamalasetti, Sai Rahul and Margala, Martin},
  title = {A hybrid {CPU--FPGA} system for high throughput
    {(10Gb/s)} streaming document classification},
  journal = j-COMP-ARCH-NEWS,
  volume = {41},
  number = {5},
  pages = {53--58},
  month = dec,
  year = {2013},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/2641361.2641370},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Mon Aug 18 17:12:43 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {Processing large volumes of information in real time
    requires large amounts of computational power, which
    consumes a significant amount of energy. With the rise
    in the amount of data produced, energy-efficient
    high-performance information processing systems are
    becoming a necessity. We present a hybrid CPU-FPGA
    system for high-throughput classification of streams of
    textual documents (e.g. emails or web pages). The
    current system parses the document stream using a
    multicore CPU and performs classification on the parsed
    stream using Field-Programmable Gate Arrays (FPGAs). As
    an example, we demonstrate a Naive Bayes classifier on
    the TREC Aquaint dataset. Our current solution can
    classify 10Gb/s internet traffic in real time. Our aim
    is to increase the throughput to 100Gb/s by
    incorporating the parser into the FPGA design.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Guo:2013:CPE,
  author = {Guo, Ce and Luk, Wayne and Vinkovskaya, Ekaterina and
    Cont, Rama},
  title = {Customisable pipelined engine for intensity evaluation
    in multivariate {Hawkes} point processes},
  journal = j-COMP-ARCH-NEWS,
  volume = {41},
  number = {5},
  pages = {59--64},
  month = dec,
  year = {2013},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/2641361.2641371},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Mon Aug 18 17:12:43 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {Hawkes processes are point processes that can be used
    to build probabilistic models to capture occurrence
    patterns of random events. They are widely used in
    high-frequency trading, seismic analysis and
    neuroscience. A critical calculation in Hawkes process
    models is intensity evaluation. The intensity of a
    point process represents the instantaneous rate of
    occurrence of events, but it is computationally
    expensive and challenging to calculate efficiently in
    order to make predictions using Hawkes process models.
    To accelerate the computation, we analyse data
    dependency in the intensity evaluation routine, and
    present a strategy to enable multiple intensities to be
    computed with a single pass through the data. We then
    design and optimise a pipelined hardware engine based
    on our strategy. In our experiments, an FPGA-based
    implementation of the proposed engine is evaluated by
    four case studies. This implementation achieves up to
    94 times speedup over an optimised CPU implementation
    with one core, and 12 times speedup over a CPU with
    eight cores.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Giefers:2013:AFD,
  author = {Giefers, Heiner and Plessl, Christian and
    F{\"o}rstner, Jens},
  title = {Accelerating finite difference time domain simulations
    with reconfigurable dataflow computers},
  journal = j-COMP-ARCH-NEWS,
  volume = {41},
  number = {5},
  pages = {65--70},
  month = dec,
  year = {2013},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/2641361.2641372},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Mon Aug 18 17:12:43 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {Finite difference methods are widely used, highly
    parallel algorithms for solving differential equations.
    However, the algorithms are memory bound and thus
    difficult to implement efficiently on CPUs or GPUs. In
    this work we study the implementation of the finite
    difference time domain (FDTD) method for solving
    Maxwell's equations on an FPGA-based Maxeler dataflow
    computer. We evaluate our work with actual problems
    from the domain of computational nanophotonics. The use
    of realistic simulations requires us to pay special
    attention to boundary conditions (Dirichlet, periodic,
    absorbing), which are critical for the correctness of
    results but detrimental to the performance and thus
    frequently neglected. We discuss and evaluate the
    design of two different FDTD implementations, which
    outperform CPU and GPU implementations. To our
    knowledge, our implementation is the fastest FPGA-based
    FDTD solver.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Ogawa:2013:RJA,
  author = {Ogawa, Yuki and Iida, Masahiro and Amagasaki, Motoki and
    Kuga, Morihiro and Sueyoshi, Toshinori},
  title = {A reconfigurable {Java} accelerator with software
    compatibility for embedded systems},
  journal = j-COMP-ARCH-NEWS,
  volume = {41},
  number = {5},
  pages = {71--76},
  month = dec,
  year = {2013},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/2641361.2641373},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Mon Aug 18 17:12:43 MDT 2014},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/java2010.bib;
    https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

@Article{Ohkawa:2013:RHO,
  author = {Ohkawa, Takeshi and Uetake, Daichi and Yokota, Takashi and
    Ootsu, Kanemitsu and Baba, Takanobu},
  title = {Reconfigurable and hardwired {ORB} engine on {FPGA} by
    {Java-to-HDL} synthesizer for realtime application},
  journal = j-COMP-ARCH-NEWS,
  volume = {41},
  number = {5},
  pages = {77--82},
  month = dec,
  year = {2013},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/2641361.2641374},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Mon Aug 18 17:12:43 MDT 2014},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/java2010.bib;
    https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {A platform for networked FPGA system design, which is
    named ``ORB Engine'', is proposed to add more
    controllability and design productivity on FPGA-based
    systems composed of software and hardwired IPs. A
    developer can define an object-oriented interface for
    the circuit IP in FPGA, and implement the control
    sequence part using Java. The circuit IP in FPGA can be
    handled through object-oriented interface from variety
    of programming languages like C++, Java, Python, Ruby
    and so on. Application specific and high-efficiency
    circuit for ORB (Object Request Broker) protocol
    processing is synthesized from easy-handling Java code
    using JavaRock Java-to-HDL synthesizer within the
    de-facto standard CORBA (Common Object Request Broker
    Architecture). The measurement result shows a very low
    latency as low as 200us of UDP/IP packet in/out and
    exhibits a fluctuation free delay performance, which is
    desirable for real-time applications.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
}

%%% Article record: de Dinechin, Istoan, and Sergent, "Fixed-point
%%% trigonometric functions on FPGAs", ACM SIGARCH Computer Architecture
%%% News 41(5), pages 83--88, December 2013.  The lowercase "de" in the
%%% first author's name is parsed by BibTeX as a von-part of the surname.
%%% The bibsource field also lists elefunt.bib and fparith.bib;
%%% presumably the record is shared with those files, so verify them
%%% before correcting anything only here.
@Article{deDinechin:2013:FPT,
  author =       "Florent de Dinechin and Matei Istoan and Guillaume
                 Sergent",
  title =        "Fixed-point trigonometric functions on {FPGAs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "5",
  pages =        "83--88",
  month =        dec,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2641361.2641375",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 18 17:12:43 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/elefunt.bib;
                 https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Three approaches for computing sines and cosines on
                 FPGAs are studied in this paper, with a focus of
                 high-throughput pipelined architecture, and
                 state-of-the-art implementation techniques. The first
                 approach is the classical CORDIC iteration, for which
                 we suggest a reduced iteration technique and fine
                 optimizations in datapath width and latency. The second
                 is an ad-hoc architecture specifically designed around
                 trigonometric identities. The third uses a generic
                 table- and DSP-based polynomial approximator. These
                 three architectures are implemented and compared in the
                 FloPoCo framework.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Article record: Tada, "Performance evaluation of 3-D stacked 32-bit
%%% parallel multipliers", ACM SIGARCH Computer Architecture News 41(5),
%%% pages 89--94, December 2013.  In the title, the braces around the
%%% "3-D" group keep the capital D from being downcased by bibliography
%%% styles; the unbraced math "32" contains no letters and so needs no
%%% such protection.  The "\%" in the abstract is escaped because a bare
%%% percent sign would start a TeX comment when the abstract is typeset.
@Article{Tada:2013:PED,
  author =       "Jubee Tada",
  title =        "Performance evaluation of {$3$-D} stacked $ 32$-bit
                 parallel multipliers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "5",
  pages =        "89--94",
  month =        dec,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2641361.2641376",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 18 17:12:43 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Conventional two-dimensional (2-D) implementation
                 technologies face certain limitations; to overcome
                 these limitations, three-dimensional (3-D) integration
                 technologies have been developed. There has been a
                 focus on circuit partitioning strategies because they
                 play an important role in exploiting the potential of
                 3-D stacked circuits. The Middle-Grain circuit
                 partitioning strategy has been proposed to exploit the
                 potential of 3-D stacked circuits. The proposed
                 strategy equalizes the area of each layer and avoids
                 the critical paths across different layers as much as
                 possible. In this study, 3-D stacked parallel
                 multipliers are designed using various circuit
                 partitioning strategies. Experimental results
                 demonstrate that the 3-D stacked 32-bit parallel
                 multiplier, designed using the proposed strategy,
                 achieves a 27\% delay reduction as compared to the 2-D
                 implementation.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Article record: Tanaka, Sato, and Kise, "The UltraSmall soft
%%% processor", ACM SIGARCH Computer Architecture News 41(5), pages
%%% 95--100, December 2013.  The braces around UltraSmall in the title
%%% preserve its mixed-case spelling under styles that downcase titles.
%%% NOTE(review): the abstract spells the Toronto processor both
%%% "small-core" and "smallcore"; confirm against the publisher's text
%%% before normalizing either form.
@Article{Tanaka:2013:USP,
  author =       "Yuichiroh Tanaka and Shimpei Sato and Kenji Kise",
  title =        "The {UltraSmall} soft processor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "5",
  pages =        "95--100",
  month =        dec,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2641361.2641377",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 18 17:12:43 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "A soft processor is a processor that is implemented
                 using logic synthesis mainly targeting programmable
                 logic device like FPGA and it becomes a common
                 component for FPGA designs. The supersmall soft
                 processor (small-core) developed at University of
                 Toronto is a unique soft processor because its main
                 concern is very low hardware cost while supporting
                 32-bit ISA. With the same concept as small-core, we are
                 developing the ultrasmall soft processor (UltraSmall)
                 based on smallcore. The goal of this project is to
                 implement the smallest 32-bit ISA soft processor while
                 aiming to achieve high performance. We propose
                 UltraSmall and describe its key ideas and
                 implementations. The evaluation results indicate that
                 the hardware cost of UltraSmall is smaller than
                 smallcore in the latest FPGA while achieving 1.8x
                 performance of small-core.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Article record: Guo, Thomas, and Luk, "Customisable architectures
%%% for the set covering problem", ACM SIGARCH Computer Architecture
%%% News 41(5), pages 101--106, December 2013.  The title and abstract
%%% consistently use the spellings "customisable" and "customise";
%%% presumably these are the authors' own, so do not normalize them
%%% without checking the publisher's record.
@Article{Guo:2013:CAS,
  author =       "Liucheng Guo and David B. Thomas and Wayne Luk",
  title =        "Customisable architectures for the set covering
                 problem",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "5",
  pages =        "101--106",
  month =        dec,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2641361.2641378",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 18 17:12:43 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper proposes novel customisable streaming
                 architectures for the NP-hard set covering problem. Our
                 approach covers both exhaustive and genetic algorithms,
                 supporting coarse-grain parallelism and deep pipelines
                 while allowing trade-offs between performance and
                 resource usage. Experiments targeting Maxeler systems
                 show that our FPGA-based designs are more effective
                 than the corresponding multicore software versions. The
                 speed up of the exhaustive algorithm exceeds 250 times,
                 and that of the genetic algorithm exceeds 60 times.
                 Meanwhile, our implementations are more flexible than
                 other FPGA solutions, allowing users to customise
                 parameters at run time without recompilation.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Article record: Plumbridge, Whitham, and Audsley, "Blueshell: a
%%% platform for rapid prototyping of multiprocessor NoCs and
%%% accelerators", ACM SIGARCH Computer Architecture News 41(5),
%%% December 2013.  The page range ends at 112: the next article in this
%%% issue (Hong:2013:RTR) begins on page 113, so a larger final page
%%% would make the two ranges overlap.
@Article{Plumbridge:2013:BPR,
  author =       "Gary Plumbridge and Jack Whitham and Neil Audsley",
  title =        "{Blueshell}: a platform for rapid prototyping of
                 multiprocessor {NoCs} and accelerators",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "5",
  pages =        "107--112",
  month =        dec,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2641361.2641379",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 18 17:12:43 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The rapid increase in FPGA logic capacity has enabled
                 the prototyping of multiprocessor Network-on-Chip (NoC)
                 architectures. However, the design space exploration of
                 these complex architectures is highly time consuming
                 with traditional methodologies for FPGA design. Our
                 paper addresses the challenges of multiprocessor
                 network design with the Blueshell framework for
                 generating multiprocessor networks on chip (NoC) and a
                 coupled Java software stack, Network-Chi. With
                 Blueshell hardware is constructed from high-level
                 components including processors and routers using
                 concise Bluespec System Verilog. The Network-Chi
                 software framework is also presented to enable
                 programming the on-chip processors in a familiar Java
                 style and without exposing the low-level systems
                 programming to the application designer. We demonstrate
                 that Blueshell systems with as many as 20 processors
                 can be implemented on a modestly sized FPGA.
                 Performance figures for a selection of distributed
                 applications are also provided.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Article record: Hong, Benkrid, Isa, and Iturbe, "A run-time
%%% reconfigurable system for adaptive high performance efficient
%%% computing", ACM SIGARCH Computer Architecture News 41(5), pages
%%% 113--118, December 2013.  The "approximately 3x" figure in the
%%% abstract is written with a math-mode \sim because a bare tilde is a
%%% non-breaking space in TeX and would vanish from typeset output.
@Article{Hong:2013:RTR,
  author =       "Chuan Hong and Khaled Benkrid and Nazrin Isa and
                 Xabier Iturbe",
  title =        "A run-time reconfigurable system for adaptive high
                 performance efficient computing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "5",
  pages =        "113--118",
  month =        dec,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2641361.2641380",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 18 17:12:43 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Field programmable hardware gives electronic systems
                 the ability to be reconfigured at run time. This allows
                 electronic systems to be more efficiently customized on
                 demand and on-the-fly depending on user requirements
                 and environmental changes. This paper presents a
                 run-time reconfigurable system that allows computing
                 tasks to adjust their sizes in response to current
                 available resources, optimizing the overall performance
                 by maximally exploiting all the resources on the chip.
                 In particular, we present a novel run-time task
                 assembler, which assembles tasks with desired
                 parameters on-the-fly, together with an efficacious
                 run-time task placer to rapidly configure tasks at
                 optimum locations. The system is demonstrated with a
                 dynamic programming-based pairwise sequence alignment
                 application. Real hardware implementation result shows
                 that our run-time reconfigurable system optimizes
                 resource usage on the fly by $ \sim $3x, while matching
                 the performance of carefully hand-crafted static
                 solutions.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Article record: Thorson, "Internet nuggets", ACM SIGARCH Computer
%%% Architecture News 41(5), pages 119--127, December 2013.  No abstract
%%% field is recorded for this item.  Presumably the lowercase "c"
%%% suffix on the citation key distinguishes it from other 2013 Thorson
%%% entries with the same title abbreviation (verify elsewhere in the
%%% file before renaming the key).
@Article{Thorson:2013:INc,
  author =       "Mark Thorson",
  title =        "{Internet} nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "41",
  number =       "5",
  pages =        "119--127",
  month =        dec,
  year =         "2013",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2641361.2641382",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Aug 18 17:12:43 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Article record: "Inside Windows Azure: the challenges and
%%% opportunities of a cloud operating system", ACM SIGARCH Computer
%%% Architecture News 42(1), pages 1--2, March 2014.  The remark field
%%% ties this issue to the ASPLOS '14 conference proceedings.
%%% NOTE(review): the abstract describes a talk about Microsoft's
%%% Windows Azure; confirm against the publisher's record that the
%%% listed author is the speaker rather than, for example, a session
%%% chair.  The citation key is derived from the author name, so any
%%% correction would also require a key change.
@Article{Davis:2014:IWA,
  author =       "Al Davis",
  title =        "Inside {Windows Azure}: the challenges and
                 opportunities of a cloud operating system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "1--2",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2560008",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Cloud operating systems provide on-demand, scalable
                 compute and storage resources. They allow service
                 developers to focus on their business logic by
                 simplifying many portions of their service, including
                 resource management, provisioning, monitoring, and
                 application lifecycle management. This talk describes
                 some of the technical challenges faced, as well as
                 emergent opportunities created, by Microsoft's cloud
                 operating system Windows Azure.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

%%% Article record: Novakovic, Daglis, Bugnion, Falsafi, and Grot,
%%% "Scale-out NUMA", ACM SIGARCH Computer Architecture News 42(1),
%%% pages 3--18, March 2014 (ASPLOS '14 proceedings issue, per the
%%% remark field).  Multiplication signs in the abstract are written in
%%% math mode, and sentence-level dashes as TeX em-dashes, including the
%%% one that sets off the description of the remote memory controller.
@Article{Novakovic:2014:SN,
  author =       "Stanko Novakovic and Alexandros Daglis and Edouard
                 Bugnion and Babak Falsafi and Boris Grot",
  title =        "Scale-out {NUMA}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "3--18",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541965",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Emerging datacenter applications operate on vast
                 datasets that are kept in DRAM to minimize latency. The
                 large number of servers needed to accommodate this
                 massive memory footprint requires frequent
                 server-to-server communication in applications such as
                 key-value stores and graph-based applications that rely
                 on large irregular data structures. The fine-grained
                 nature of the accesses is a poor match to commodity
                 networking technologies, including RDMA, which incur
                 delays of 10--1000$ \times $ over local DRAM
                 operations. We introduce Scale-Out NUMA (soNUMA) --- an
                 architecture, programming model, and communication
                 protocol for low-latency, distributed in-memory
                 processing. soNUMA layers an RDMA-inspired programming
                 model directly on top of a NUMA memory fabric via a
                 stateless messaging protocol. To facilitate
                 interactions between the application, OS, and the
                 fabric, soNUMA relies on the remote memory controller
                 --- a new architecturally-exposed hardware block
                 integrated into the node's local coherence hierarchy.
                 Our results based on cycle-accurate full-system
                 simulation show that soNUMA performs remote reads at
                 latencies that are within 4$ \times $ of local DRAM,
                 can fully utilize the available memory bandwidth, and
                 can issue up to 10M remote memory operations per second
                 per core.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

%%% Article record: Agrawal, Pistol, Pang, Tran, Tarjan, and Lebeck,
%%% "Rhythm: harnessing data parallel hardware for server workloads",
%%% ACM SIGARCH Computer Architecture News 42(1), pages 19--34, March
%%% 2014 (ASPLOS '14 proceedings issue, per the remark field).  The
%%% braces around Rhythm in the title protect the system name from
%%% downcasing by bibliography styles.
@Article{Agrawal:2014:RHD,
  author =       "Sandeep R. Agrawal and Valentin Pistol and Jun Pang
                 and John Tran and David Tarjan and Alvin R. Lebeck",
  title =        "{Rhythm}: harnessing data parallel hardware for server
                 workloads",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "19--34",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541956",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Trends in increasing web traffic demand an increase in
                 server throughput while preserving energy efficiency
                 and total cost of ownership. Present work in optimizing
                 data center efficiency primarily focuses on the data
                 center as a whole, using off-the-shelf hardware for
                 individual servers. Server capacity is typically
                 increased by adding more machines, which is cheap,
                 though inefficient in the long run in terms of energy
                 and area. Our work builds on the observation that
                 server workload execution patterns are not completely
                 unique across multiple requests. We present a
                 framework---called Rhythm---for high throughput servers
                 that can exploit similarity across requests to improve
                 server performance and power/energy efficiency by
                 launching data parallel executions for request cohorts.
                 An implementation of the SPECWeb Banking workload using
                 Rhythm on NVIDIA GPUs provides a basis for evaluating
                 both software and hardware for future cohort-based
                 servers. Our evaluation of Rhythm on future server
                 platforms shows that it achieves 4x the throughput
                 (reqs/sec) of a core i7 at efficiencies (reqs/Joule)
                 comparable to a dual core ARM Cortex A9. A Rhythm
                 implementation that generates transposed responses
                 achieves 8x the i7 throughput while processing 2.5x
                 more requests/Joule compared to the A9.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

%%% Article record: Samadi, Jamshidi, Lee, and Mahlke, "Paraprox:
%%% pattern-based approximation for data parallel applications", ACM
%%% SIGARCH Computer Architecture News 42(1), pages 35--50, March 2014
%%% (ASPLOS '14 proceedings issue, per the remark field).  The "\%" in
%%% the abstract is escaped because a bare percent sign would start a
%%% TeX comment when the abstract is typeset.
@Article{Samadi:2014:PPB,
  author =       "Mehrzad Samadi and Davoud Anoushe Jamshidi and
                 Janghaeng Lee and Scott Mahlke",
  title =        "{Paraprox}: pattern-based approximation for data
                 parallel applications",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "35--50",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541948",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Approximate computing is an approach where reduced
                 accuracy of results is traded off for increased speed,
                 throughput, or both. Loss of accuracy is not
                 permissible in all computing domains, but there are a
                 growing number of data-intensive domains where the
                 output of programs need not be perfectly correct to
                 provide useful results or even noticeable differences
                 to the end user. These soft domains include multimedia
                 processing, machine learning, and data mining/analysis.
                 An important challenge with approximate computing is
                 transparency to insulate both software and hardware
                 developers from the time, cost, and difficulty of using
                 approximation. This paper proposes a software-only
                 system, Paraprox, for realizing transparent
                 approximation of data-parallel programs that operates
                 on commodity hardware systems. Paraprox starts with a
                 data-parallel kernel implemented using OpenCL or CUDA
                 and creates a parameterized approximate kernel that is
                 tuned at runtime to maximize performance subject to a
                 target output quality (TOQ) that is supplied by the
                 user. Approximate kernels are created by recognizing
                 common computation idioms found in data-parallel
                 programs (e.g., Map, Scatter/Gather, Reduction, Scan,
                 Stencil, and Partition) and substituting approximate
                 implementations in their place. Across a set of 13 soft
                 data-parallel applications with at most 10\% quality
                 degradation, Paraprox yields an average performance
                 gain of 2.7x on a NVIDIA GTX 560 GPU and 2.5x on an
                 Intel Core i7 quad-core processor compared to accurate
                 execution on each platform.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

%%% Article record: Bornholt, Mytkowicz, and McKinley, "Uncertain<T>: a
%%% first-order type for uncertain data", ACM SIGARCH Computer
%%% Architecture News 42(1), pages 51--66, March 2014 (ASPLOS '14
%%% proceedings issue, per the remark field).  The angle brackets of the
%%% generic type name are written in math mode, and the whole name is
%%% braced in the title so that the capital U and the type parameter T
%%% survive downcasing by bibliography styles.
%%% NOTE(review): the abstract text from "This paper introduces"
%%% onward was restored after the bracketed type name had been stripped
%%% as if it were markup, truncating the text; verify the wording
%%% against the publisher's record for the DOI below.
@Article{Bornholt:2014:UFO,
  author =       "James Bornholt and Todd Mytkowicz and Kathryn S.
                 McKinley",
  title =        "{Uncertain$<$T$>$}: a first-order type for uncertain
                 data",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "51--66",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541958",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Emerging applications increasingly use estimates such
                 as sensor data (GPS), probabilistic models, machine
                 learning, big data, and human data. Unfortunately,
                 representing this uncertain data with discrete types
                 (floats, integers, and booleans) encourages developers
                 to pretend it is not probabilistic, which causes three
                 types of uncertainty bugs. (1) Using estimates as facts
                 ignores random error in estimates. (2) Computation
                 compounds that error. (3) Boolean questions on
                 probabilistic data induce false positives and
                 negatives. This paper introduces Uncertain$<$T$>$, a
                 new programming language abstraction for uncertain
                 data. We implement a Bayesian network semantics for
                 computation and conditionals that improves program
                 correctness. The runtime uses sampling and hypothesis
                 tests to evaluate computation and conditionals lazily
                 and efficiently. We illustrate with sensor and machine
                 learning applications that Uncertain$<$T$>$ improves
                 expressiveness and accuracy. Whereas previous
                 probabilistic programming languages focus on experts,
                 Uncertain$<$T$>$ serves a wide range of developers.
                 Experts still identify error distributions. However,
                 both experts and application writers compute with
                 distributions, improve estimates with domain knowledge,
                 and ask questions with conditionals. The
                 Uncertain$<$T$>$ type system and operators encourage
                 developers to expose and reason about uncertainty
                 explicitly, controlling false positives and false
                 negatives. These benefits make Uncertain$<$T$>$ a
                 compelling programming model for modern applications
                 facing the challenge of uncertainty.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

%%% Article record: Santos, Raj, Saroiu, and Wolman, "Using ARM
%%% TrustZone to build a trusted language runtime for mobile
%%% applications", ACM SIGARCH Computer Architecture News 42(1), pages
%%% 67--80, March 2014 (ASPLOS '14 proceedings issue, per the remark
%%% field).  The product name ARM TrustZone is braced as a unit in the
%%% title so that its capitalization survives downcasing by
%%% bibliography styles.
@Article{Santos:2014:UAT,
  author =       "Nuno Santos and Himanshu Raj and Stefan Saroiu and
                 Alec Wolman",
  title =        "Using {ARM TrustZone} to build a trusted language
                 runtime for mobile applications",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "67--80",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541949",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper presents the design, implementation, and
                 evaluation of the Trusted Language Runtime (TLR), a
                 system that protects the confidentiality and integrity
                 of .NET mobile applications from OS security breaches.
                 TLR enables separating an application's
                 security-sensitive logic from the rest of the
                 application, and isolates it from the OS and other
                 apps. TLR provides runtime support for the secure
                 component based on a .NET implementation for embedded
                 devices. TLR reduces the TCB of an open source .NET
                 implementation by a factor of $ 78 $ with a tolerable
                 performance cost. The main benefit of the TLR is to
                 bring the developer benefits of managed code to trusted
                 computing. With the TLR, developers can build their
                 trusted components with the productivity benefits of
                 modern high level languages, such as strong typing and
                 garbage collection.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

%%% Article record: Criswell, Dautenhahn, and Adve, "Virtual Ghost:
%%% protecting applications from hostile operating systems", ACM SIGARCH
%%% Computer Architecture News 42(1), pages 81--96, March 2014 (ASPLOS
%%% '14 proceedings issue, per the remark field).  Percent signs in the
%%% abstract are escaped as "\%" because a bare percent sign would start
%%% a TeX comment when the abstract is typeset.
@Article{Criswell:2014:VGP,
  author =       "John Criswell and Nathan Dautenhahn and Vikram Adve",
  title =        "{Virtual Ghost}: protecting applications from hostile
                 operating systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "81--96",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541986",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Applications that process sensitive data can be
                 carefully designed and validated to be difficult to
                 attack, but they are usually run on monolithic,
                 commodity operating systems, which may be less secure.
                 An OS compromise gives the attacker complete access to
                 all of an application's data, regardless of how well
                 the application is built. We propose a new system,
                 Virtual Ghost, that protects applications from a
                 compromised or even hostile OS. Virtual Ghost is the
                 first system to do so by combining compiler
                 instrumentation and run-time checks on operating system
                 code, which it uses to create ghost memory that the
                 operating system cannot read or write. Virtual Ghost
                 interposes a thin hardware abstraction layer between
                 the kernel and the hardware that provides a set of
                 operations that the kernel must use to manipulate
                 hardware, and provides a few trusted services for
                 secure applications such as ghost memory management,
                 encryption and signing services, and key management.
                 Unlike previous solutions, Virtual Ghost does not use a
                 higher privilege level than the kernel. Virtual Ghost
                 performs well compared to previous approaches; it
                 outperforms InkTag on five out of seven of the LMBench
                 microbenchmarks with improvements between 1.3x and
                 14.3x. For network downloads, Virtual Ghost experiences
                 a 45\% reduction in bandwidth at most for small files
                 and nearly no reduction in bandwidth for large files
                 and web traffic. An application we modified to use
                 ghost memory shows a maximum additional overhead of 5\%
                 due to the Virtual Ghost protections. We also
                 demonstrate Virtual Ghost's efficacy by showing how it
                 defeats sophisticated rootkit attacks.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

%%% Article record: Li, Kashyap, Oberg, Tiwari, Rajarathinam, Kastner,
%%% Sherwood, Hardekopf, and Chong, "Sapper: a language for
%%% hardware-level security policy enforcement", ACM SIGARCH Computer
%%% Architecture News 42(1), pages 97--112, March 2014 (ASPLOS '14
%%% proceedings issue, per the remark field).  The quoted word in the
%%% abstract uses TeX-style paired quotes rather than a double-quote
%%% character, which would otherwise terminate the quote-delimited
%%% field value.
@Article{Li:2014:SLH,
  author =       "Xun Li and Vineeth Kashyap and Jason K. Oberg and
                 Mohit Tiwari and Vasanth Ram Rajarathinam and Ryan
                 Kastner and Timothy Sherwood and Ben Hardekopf and
                 Frederic T. Chong",
  title =        "{Sapper}: a language for hardware-level security
                 policy enforcement",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "97--112",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541947",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Privacy and integrity are important security concerns.
                 These concerns are addressed by controlling information
                 flow, i.e., restricting how information can flow
                 through a system. Most proposed systems that restrict
                 information flow make the implicit assumption that the
                 hardware used by the system is fully ``correct'' and
                 that the hardware's instruction set accurately
                 describes its behavior in all circumstances. The truth
                 is more complicated: modern hardware designs defy
                 complete verification; many aspects of the timing and
                 ordering of events are left totally unspecified; and
                 implementation bugs present themselves with surprising
                 frequency. In this work we describe Sapper, a novel
                 hardware description language for designing
                 security-critical hardware components. Sapper seeks to
                 address these problems by using static analysis at
                 compile-time to automatically insert dynamic checks in
                 the resulting hardware that provably enforce a given
                 information flow policy at execution time. We present
                 Sapper's design and formal semantics along with a proof
                 sketch of its security. In addition, we have
                 implemented a compiler for Sapper and used it to create
                 a non-trivial secure embedded processor with many
                 modern microarchitectural features. We empirically
                 evaluate the resulting hardware's area and energy
                 overhead and compare them with alternative designs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

@Article{Banabic:2014:FTM,
  author =       "Radu Banabic and George Candea and Rachid Guerraoui",
  title =        "Finding {Trojan} message vulnerabilities in
                 distributed systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "113--126",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541984",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Trojan messages are messages that seem correct to the receiver
                 but cannot be generated by any correct sender. Such messages
                 constitute major vulnerability points of a distributed
                 system---they constitute ideal targets for a malicious actor
                 and facilitate failure propagation across nodes. We describe
                 Achilles, a tool that searches for Trojan messages in a
                 distributed system. Achilles uses dynamic white-box analysis
                 on the distributed system binaries in order to infer the
                 predicate that defines messages parsed by receiver nodes and
                 generated by sender nodes, respectively, and then computes
                 Trojan messages as the difference between the two. We apply
                 Achilles on implementations of real distributed systems: FSP,
                 a file transfer application, and PBFT, a
                 Byzantine-fault-tolerant state machine replication library.
                 Achilles discovered a new bug in FSP and rediscovered a
                 previously known vulnerability in PBFT. In our evaluation we
                 demonstrate that our approach can perform orders of magnitude
                 better than general approaches based on regular fuzzing and
                 symbolic execution.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

@Article{Delimitrou:2014:QRE,
  author =       "Christina Delimitrou and Christos Kozyrakis",
  title =        "{Quasar}: resource-efficient and {QoS}-aware cluster
                 management",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "127--144",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541941",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Cloud computing promises flexibility and high performance for
                 users and high cost-efficiency for operators. Nevertheless,
                 most cloud facilities operate at very low utilization, hurting
                 both cost effectiveness and future scalability. We present
                 Quasar, a cluster management system that increases resource
                 utilization while providing consistently high application
                 performance. Quasar employs three techniques. First, it does
                 not rely on resource reservations, which lead to
                 underutilization as users do not necessarily understand
                 workload dynamics and physical resource requirements of
                 complex codebases. Instead, users express performance
                 constraints for each workload, letting Quasar determine the
                 right amount of resources to meet these constraints at any
                 point. Second, Quasar uses classification techniques to
                 quickly and accurately determine the impact of the amount of
                 resources (scale-out and scale-up), type of resources, and
                 interference on performance for each workload and dataset.
                 Third, it uses the classification results to jointly perform
                 resource allocation and assignment, quickly exploring the
                 large space of options for an efficient way to pack workloads
                 on available resources. Quasar monitors workload performance
                 and adjusts resource allocation and assignment when needed. We
                 evaluate Quasar over a wide range of workload scenarios,
                 including combinations of distributed analytics frameworks and
                 low-latency, stateful services, both on a local cluster and a
                 cluster of dedicated EC2 servers. At steady state, Quasar
                 improves resource utilization by 47\% in the 200-server EC2
                 cluster, while meeting performance constraints for workloads
                 of all types.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

@Article{Zahedi:2014:RRE,
  author =       "Seyed Majid Zahedi and Benjamin C. Lee",
  title =        "{REF}: resource elasticity fairness with sharing
                 incentives for multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "145--160",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541962",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "With the democratization of cloud and datacenter
                 computing, users increasingly share large hardware
                 platforms. In this setting, architects encounter two
                 challenges: sharing fairly and sharing multiple
                 resources. Drawing on economic game-theory, we rethink
                 fairness in computer architecture. A fair allocation
                 must provide sharing incentives (SI), envy-freeness
                 (EF), and Pareto efficiency (PE). We show that
                 Cobb--Douglas utility functions are well suited to
                 modeling user preferences for cache capacity and memory
                 bandwidth. And we present an allocation mechanism that
                 uses Cobb--Douglas preferences to determine each user's
                 fair share of the hardware. This mechanism provably
                 guarantees SI, EF, and PE, as well as
                 strategy-proofness in the large (SPL). And it does so
                 with modest performance penalties, less than 10\%
                 throughput loss, relative to an unfair mechanism.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

@Article{Muthukaruppan:2014:PTB,
  author =       "Thannirmalai Somu Muthukaruppan and Anuj Pathania and
                 Tulika Mitra",
  title =        "Price theory based power management for heterogeneous
                 multi-cores",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "161--176",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541974",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Heterogeneous multi-cores that integrate cores with different
                 power performance characteristics are promising alternatives
                 to homogeneous systems in energy- and thermally constrained
                 environments. However, the heterogeneity imposes significant
                 challenges to power-aware scheduling. We present a price
                 theory-based dynamic power management framework for
                 heterogeneous multi-cores that co-ordinates various energy
                 savings opportunities, such as dynamic voltage/frequency
                 scaling, load balancing, and task migration in tandem, to
                 achieve the best power-performance characteristics. Unlike
                 existing centralized power management frameworks, ours is
                 distributed and hence scalable with minimal runtime overhead.
                 We design and implement the framework within Linux operating
                 system on ARM big.LITTLE heterogeneous multi-core platform.
                 Experimental evaluation confirms the advantages of our
                 approach compared to the state-of-the-art techniques for power
                 management in heterogeneous multi-cores.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

@Article{Wang:2014:UBP,
  author =       "Di Wang and Sriram Govindan and Anand Sivasubramaniam
                 and Aman Kansal and Jie Liu and Badriddine Khessib",
  title =        "Underprovisioning backup power infrastructure for
                 datacenters",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "177--192",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541966",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "While there has been prior work to underprovision the power
                 distribution infrastructure for a datacenter to save costs,
                 the ability to underprovision the backup power infrastructure,
                 which contributes significantly to capital costs, is little
                 explored. There are two main components in the backup
                 infrastructure --- Diesel Generators (DGs) and UPS units ---
                 which can both be underprovisioned (or even removed) in terms
                 of their power and/or energy capacities. However, embarking on
                 such underprovisioning mandates studying several ramifications
                 --- the resulting cost savings, the lower availability, and
                 the performance and state loss consequences on individual
                 applications --- concurrently. This paper presents the first
                 such study, considering cost, availability, performance and
                 application consequences of underprovisioning the backup power
                 infrastructure. We present a framework to quantify the cost of
                 backup capacity that is provisioned, and implement techniques
                 leveraging existing software and hardware mechanisms to
                 provide as seamless an operation as possible for an
                 application within the provisioned backup capacity during a
                 power outage. We evaluate the cost-performance-availability
                 trade-offs for different levels of backup underprovisioning
                 for applications with diverse reliance on the backup
                 infrastructure. Our results show that one may be able to
                 completely do away with DGs, compensating for it with
                 additional UPS energy capacities, to significantly cut costs
                 and still be able to handle power outages lasting as high as
                 40 minutes (which constitute bulk of the outages). Further, we
                 can push the limits of outage duration that can be handled in
                 a cost-effective manner, if applications are willing to
                 tolerate degraded performance during the outage. Our
                 evaluations also show that different applications react
                 differently to the outage handling mechanisms, and that the
                 efficacy of the mechanisms is sensitive to the outage
                 duration. The insights from this paper can spur new
                 opportunities for future work on backup power infrastructure
                 optimization.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

@Article{Yu:2014:CPR,
  author =       "Xiao Yu and Shi Han and Dongmei Zhang and Tao Xie",
  title =        "Comprehending performance from real-world execution
                 traces: a device-driver case",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "193--206",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541968",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Real-world execution traces record performance
                 problems that are likely perceived at deployment sites.
                 However, those problems can be rooted subtly and deeply
                 into system layers or other components far from the
                 place where delays are initially observed. To tackle
                 challenges of identifying deeply rooted problems, we
                 propose a new trace-based approach consisting of two
                 steps: impact analysis and causality analysis. The
                 impact analysis measures performance impacts on a
                 component basis, and the causality analysis discovers
                 patterns of runtime behaviors that are likely to cause
                 the measured impacts. The discovered patterns can help
                 performance analysts quickly identify root causes of
                 perceived performance problems. We instantiate our
                 approach to study the performance of device drivers on
                 over 19,500 real-world execution traces. The impact
                 analysis shows that device drivers constitute a
                 non-trivial part ($ \approx 38 \% $) in the overall
                 system performance, and a big part ($ \approx 26 \% $)
                 is due to interactions between drivers. The causality
                 analysis effectively discovers highly suspicious and
                 high-impact behavioral patterns in device drivers,
                 examined and confirmed by our automated evaluation,
                 developers, and performance analysts.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

@Article{Arulraj:2014:LST,
  author =       "Joy Arulraj and Guoliang Jin and Shan Lu",
  title =        "Leveraging the short-term memory of hardware to
                 diagnose production-run software failures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "207--222",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541973",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Failures caused by software bugs are widespread in production
                 runs, causing severe losses for end users. Unfortunately,
                 diagnosing production-run failures is challenging. Existing
                 work cannot satisfy privacy, run-time overhead, diagnosis
                 capability, and diagnosis latency requirements all at once.
                 This paper designs a low overhead, low latency, privacy
                 preserving production-run failure diagnosis system based on
                 two observations. First, short-term memory of program
                 execution is often sufficient for failure diagnosis, as many
                 bugs have short propagation distances. Second, maintaining a
                 short-term memory of execution is much cheaper than
                 maintaining a record of the whole execution. Following these
                 observations, we first identify an existing hardware unit,
                 Last Branch Record (LBR), that records the last few taken
                 branches to help diagnose sequential bugs. We then propose a
                 simple hardware extension, Last Cache-coherence Record (LCR),
                 to record the last few cache accesses with specified coherence
                 states and hence help diagnose concurrency bugs. Finally, we
                 design LBRA and LCRA to automatically locate failure root
                 causes using LBR and LCR. Our evaluation uses 31 real-world
                 sequential and concurrency bug failures from 18 representative
                 open-source software. The results show that with just 16
                 record entries, LBR and LCR enable our system to automatically
                 locate the root causes for 27 out of 31 failures, with less
                 than 3\% run-time overhead. As our system does not rely on
                 sampling, \ldots{}",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

@Article{Honarmand:2014:RRR,
  author =       "Nima Honarmand and Josep Torrellas",
  title =        "{RelaxReplay}: record and replay for
                 relaxed-consistency multiprocessors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "223--238",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541979",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Record and Deterministic Replay (RnR) of multithreaded
                 programs on relaxed-consistency multiprocessors has
                 been a long-standing problem. While there are designs
                 that work for Total Store Ordering (TSO), finding a
                 general solution that is able to record the access
                 reordering allowed by any relaxed-consistency model has
                 proved challenging. This paper presents the first
                 complete solution for hardware-assisted memory race
                 recording that works for any relaxed-consistency model
                 of current processors. With the scheme, called
                 RelaxReplay, we can build an RnR system for any
                 relaxed-consistency model and coherence protocol.
                 RelaxReplay's core innovation is a new way of capturing
                 memory access reordering. Each memory instruction goes
                 through a post-completion in-order counting step that
                 detects any reordering, and efficiently records it. We
                 evaluate RelaxReplay with simulations of an 8-core
                 release-consistent multicore running SPLASH-2 programs.
                 We observe that RelaxReplay induces negligible overhead
                 during recording. In addition, the average size of the
                 log produced is comparable to the log sizes reported
                 for existing solutions, and still very small compared
                 to the memory bandwidth of modern machines. Finally,
                 deterministic replay is efficient and needs minimal
                 hardware support.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

@Article{Bucur:2014:PSE,
  author =       "Stefan Bucur and Johannes Kinder and George Candea",
  title =        "Prototyping symbolic execution engines for interpreted
                 languages",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "239--254",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541977",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Symbolic execution is being successfully used to automatically
                 test statically compiled code. However, increasingly more
                 systems and applications are written in dynamic interpreted
                 languages like Python. Building a new symbolic execution
                 engine is a monumental effort, and so is keeping it up-to-date
                 as the target language evolves. Furthermore, ambiguous
                 language specifications lead to their implementation in a
                 symbolic execution engine potentially differing from the
                 production interpreter in subtle ways. We address these
                 challenges by flipping the problem and using the interpreter
                 itself as a specification of the language semantics. We
                 present a recipe and tool (called Chef) for turning a vanilla
                 interpreter into a sound and complete symbolic execution
                 engine. Chef symbolically executes the target program by
                 symbolically executing the interpreter's binary while
                 exploiting inferred knowledge about the program's high-level
                 structure. Using Chef, we developed a symbolic execution
                 engine for Python in 5 person-days and one for Lua in 3
                 person-days. They offer complete and faithful coverage of
                 language features in a way that keeps up with future language
                 versions at near-zero cost. Chef-produced engines are up to
                 1000 times more performant than if directly executing the
                 interpreter symbolically without Chef.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

@Article{Wu:2014:QAD,
  author =       "Lisa Wu and Andrea Lottarini and Timothy K. Paine and
                 Martha A. Kim and Kenneth A. Ross",
  title =        "{Q100}: the architecture and design of a database
                 processing unit",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "255--268",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541961",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In this paper, we propose Database Processing Units,
                 or DPUs, a class of domain-specific database processors
                 that can efficiently handle database applications. As a
                 proof of concept, we present the instruction set
                 architecture, microarchitecture, and hardware
                 implementation of one DPU, called Q100. The Q100 has a
                 collection of heterogeneous ASIC tiles that process
                 relational tables and columns quickly and
                 energy-efficiently. The architecture uses coarse
                 grained instructions that manipulate streams of data,
                 thereby maximizing pipeline and data parallelism, and
                 minimizing the need to time multiplex the accelerator
                 tiles and spill intermediate results to memory. This
                 work explores a Q100 design space of 150
                 configurations, selecting three for further analysis: a
                 small, power-conscious implementation, a
                 high-performance implementation, and a balanced design
                 that maximizes performance per Watt. We then
                 demonstrate that the power-conscious Q100 handles the
                 TPC-H queries with three orders of magnitude less
                 energy than a state of the art software DBMS, while the
                 performance-oriented design outperforms the same DBMS
                 by 70X.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

@Article{Chen:2014:DSF,
  author =       "Tianshi Chen and Zidong Du and Ninghui Sun and Jia
                 Wang and Chengyong Wu and Yunji Chen and Olivier
                 Temam",
  title =        "{DianNao}: a small-footprint high-throughput
                 accelerator for ubiquitous machine-learning",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "269--284",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541967",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Machine-Learning tasks are becoming pervasive in a
                 broad range of domains, and in a broad range of systems
                 (from embedded systems to data centers). At the same
                 time, a small set of machine-learning algorithms
                 (especially Convolutional and Deep Neural Networks,
                 i.e., CNNs and DNNs) are proving to be state-of-the-art
                 across many applications. As architectures evolve
                 towards heterogeneous multi-cores composed of a mix of
                 cores and accelerators, a machine-learning accelerator
                 can achieve the rare combination of efficiency (due to
                 the small number of target algorithms) and broad
                 application scope. Until now, most machine-learning
                 accelerator designs have focused on efficiently
                 implementing the computational part of the algorithms.
                 However, recent state-of-the-art CNNs and DNNs are
                 characterized by their large size. In this study, we
                 design an accelerator for large-scale CNNs and DNNs,
                 with a special emphasis on the impact of memory on
                 accelerator design, performance and energy. We show
                 that it is possible to design an accelerator with a
                 high throughput, capable of performing 452 GOP/s (key
                 NN operations such as synaptic weight multiplications
                 and neurons outputs additions) in a small footprint of
                 3.02 mm$^2$ and 485 mW; compared to a 128-bit 2 GHz
                 SIMD processor, the accelerator is 117.87x faster, and
                 it can reduce the total energy by 21.08x. The
                 accelerator characteristics are obtained after layout
                 at 65 nm. Such a high throughput in a small footprint
                 can open up the usage of state-of-the-art
                 machine-learning algorithms in a broad set of systems
                 and for a broad set of applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

@Article{Lin:2014:KMO,
  author =       "Felix Xiaozhu Lin and Zhen Wang and Lin Zhong",
  title =        "{K2}: a mobile operating system for heterogeneous
                 coherence domains",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "285--300",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541975",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Mobile System-on-Chips (SoC) that incorporate
                 heterogeneous coherence domains promise high energy
                 efficiency to a wide range of mobile applications, yet
                 are difficult to program. To exploit the architecture,
                 a desirable, yet missing capability is to replicate
                 operating system (OS) services over multiple coherence
                 domains with minimum inter-domain communication. In
                 designing such an OS, we set three goals: to ease
                 application development, to simplify OS engineering,
                 and to preserve the current OS performance. To this
                 end, we identify a shared-most OS model for multiple
                 coherence domains: creating per-domain instances of
                 core OS services with no shared state, while enabling
                 other extended OS services to share state across
                 domains. To test the model, we build K2, a prototype OS
                 on the TI OMAP4 SoC, by reusing most of the Linux 3.4
                 source. K2 presents a single system image to
                 applications with its two kernels running on top of the
                 two coherence domains of OMAP4. The two kernels have
                 independent instances of core OS services, such as page
                 allocator and interrupt management, as coordinated by
                 K2; the two kernels share most extended OS services,
                 such as device drivers, whose state is kept coherent
                 transparently by K2. Despite platform constraints and
                 unoptimized code, K2 improves energy efficiency for
                 light OS workloads by 8x--10x, while incurring less than
                 6\% performance overhead for a device driver shared
                 between kernels. Our experiences with K2 show that the
                 shared-most model is promising.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

%%% Menychtas, Shen, and Scott on disengaged scheduling of GPUs and
%%% other fast accelerators: SIGARCH Computer Architecture News 42(1),
%%% pages 301--316, March 2014 (ASPLOS '14 proceedings issue, per the
%%% remark field).  The journal and acknowledgement values are unquoted
%%% string-macro names; their definitions are not in this part of the
%%% file.
@Article{Menychtas:2014:DSF,
  author =       "Konstantinos Menychtas and Kai Shen and Michael L.
                 Scott",
  title =        "Disengaged scheduling for fair, protected access to
                 fast computational accelerators",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "301--316",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541963",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Today's operating systems treat GPUs and other
                 computational accelerators as if they were simple
                 devices, with bounded and predictable response times.
                 With accelerators assuming an increasing share of the
                 workload on modern machines, this strategy is already
                 problematic, and likely to become untenable soon. If
                 the operating system is to enforce fair sharing of the
                 machine, it must assume responsibility for accelerator
                 scheduling and resource management. Fair, safe
                 scheduling is a particular challenge on fast
                 accelerators, which allow applications to avoid
                 kernel-crossing overhead by interacting directly with
                 the device. We propose a disengaged scheduling strategy
                 in which the kernel intercedes between applications and
                 the accelerator on an infrequent basis, to monitor
                 their use of accelerator cycles and to determine which
                 applications should be granted access over the next
                 time interval. Our strategy assumes a well defined,
                 narrow interface exported by the accelerator. We build
                 upon such an interface, systematically inferred for the
                 latest Nvidia GPUs. We construct several example
                 schedulers, including Disengaged Timeslice with overuse
                 control that guarantees fairness and Disengaged Fair
                 Queueing that is effective in limiting resource
                 idleness, but probabilistic. Both schedulers ensure
                 fair sharing of the GPU, even among uncooperative or
                 adversarial applications; Disengaged Fair Queueing
                 incurs a 4\% overhead on average (max 18\%) compared to
                 direct device access across our evaluation scenarios.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

%%% Gehlhaar's talk abstract on neuromorphic processing: SIGARCH
%%% Computer Architecture News 42(1), pages 317--318, March 2014
%%% (ASPLOS '14 proceedings issue, per the remark field).  The quoted
%%% phrase in the abstract uses TeX quote input (` to open, ' to close),
%%% since two straight apostrophes would both typeset as closing quotes.
@Article{Gehlhaar:2014:NPN,
  author =       "Jeff Gehlhaar",
  title =        "Neuromorphic processing: a new frontier in scaling
                 computer architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "317--318",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2564710",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The desire to build a computer that operates in the
                 same manner as our brains is as old as the computer
                 itself. Although computer engineering has made great
                 strides in hardware performance as a result of Dennard
                 scaling, and even great advances in `brain like'
                 computation, the field still struggles to move beyond
                 sequential, analytical computing architectures.
                 Neuromorphic systems are being developed to transcend
                 the barriers imposed by silicon power consumption,
                 develop new algorithms that help machines achieve
                 cognitive behaviors, and both exploit and enable
                 further research in neuroscience. In this talk I will
                 discuss a system implementing spiking neural networks.
                 These systems hold the promise of an architecture that
                 is event based, broad and shallow, and thus more power
                 efficient than conventional computing solutions. This
                 new approach to computation based on modeling the brain
                 and its simple but highly connected units presents a
                 host of new challenges. Hardware faces tradeoffs such
                 as density or lower power at the cost of high
                 interconnection overhead. Consequently, software
                 systems must face choices about new language design.
                 Highly distributed hardware systems require complex
                 place and route algorithms to distribute the execution
                 of the neural network across a large number of highly
                 interconnected processing units. Finally, the overall
                 design, simulation and testing process has to be
                 entirely reimagined. We discuss these issues in the
                 context of the Zeroth processor and how this approach
                 compares to other neuromorphic systems that are
                 becoming available.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

%%% Amiri Sani, Boos, Qin, and Zhong on Paradice, I/O paravirtualization
%%% at the Unix device file boundary: SIGARCH Computer Architecture News
%%% 42(1), pages 319--332, March 2014 (ASPLOS '14 proceedings issue, per
%%% the remark field).  The approximate line counts in the abstract are
%%% written with $\sim$ because a bare tilde is a TeX tie (an
%%% unbreakable space) and would not print as a tilde.
@Article{Sani:2014:PDF,
  author =       "Ardalan Amiri Sani and Kevin Boos and Shaopu Qin and
                 Lin Zhong",
  title =        "{I/O} paravirtualization at the device file boundary",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "319--332",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541943",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Paravirtualization is an important I/O virtualization
                 technology since it uniquely provides all of the
                 following benefits: the ability to share the device
                 between multiple VMs, support for legacy devices
                 without virtualization hardware, and high performance.
                 However, existing paravirtualization solutions have one
                 main limitation: they only support one I/O device
                 class, and would require significant engineering effort
                 to support new device classes and features. In this
                 paper, we present Paradice, a solution that vastly
                 simplifies I/O paravirtualization by using a common
                 paravirtualization boundary for various I/O device
                 classes: Unix device files. Using this boundary, the
                 paravirtual drivers simply act as a class-agnostic
                 indirection layer between the application and the
                 actual device driver. We address two fundamental
                 challenges: supporting cross-VM driver memory
                 operations without changes to applications or device
                 drivers and providing fault and device data isolation
                 between guest VMs despite device driver bugs. We
                 implement Paradice for x86, the Xen hypervisor, and the
                 Linux and FreeBSD OSes. Our implementation
                 paravirtualizes various GPUs, input devices, cameras,
                 an audio device, and an Ethernet card for the netmap
                 framework with $\sim$7700 LoC, of which only $\sim$900
                 are device class-specific. Our measurements show that
                 Paradice achieves performance close to native for
                 different devices and applications including netmap,
                 3D HD games, and OpenCL applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

%%% Dall and Nieh on KVM/ARM, the Linux ARM hypervisor: SIGARCH Computer
%%% Architecture News 42(1), pages 333--348, March 2014 (ASPLOS '14
%%% proceedings issue, per the remark field).  In the title, \slash is
%%% followed directly by a space inside one brace group so that TeX
%%% absorbs the space and the name typesets as KVM/ARM with no gap.
@Article{Dall:2014:KAD,
  author =       "Christoffer Dall and Jason Nieh",
  title =        "{KVM\slash ARM}: the design and implementation of
                 the {Linux ARM} hypervisor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "333--348",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541946",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "As ARM CPUs become increasingly common in mobile
                 devices and servers, there is a growing demand for
                 providing the benefits of virtualization for ARM-based
                 devices. We present our experiences building the Linux
                 ARM hypervisor, KVM/ARM, the first full system ARM
                 virtualization solution that can run unmodified guest
                 operating systems on ARM multicore hardware. KVM/ARM
                 introduces split-mode virtualization, allowing a
                 hypervisor to split its execution across CPU modes and
                 be integrated into the Linux kernel. This allows
                 KVM/ARM to leverage existing Linux hardware support and
                 functionality to simplify hypervisor development and
                 maintainability while utilizing recent ARM hardware
                 virtualization extensions to run virtual machines with
                 comparable performance to native execution. KVM/ARM has
                 been successfully merged into the mainline Linux
                 kernel, ensuring that it will gain wide adoption as the
                 virtualization platform of choice for ARM. We provide
                 the first measurements on real hardware of a complete
                 hypervisor using ARM hardware virtualization support.
                 Our results demonstrate that KVM/ARM has modest
                 virtualization performance and power costs, and can
                 achieve lower performance and power costs compared to
                 x86-based Linux virtualization on multicore hardware.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

%%% Amit, Tsafrir, and Schuster on VSwapper, a guest-agnostic memory
%%% swapper for uncooperative overcommitment: SIGARCH Computer
%%% Architecture News 42(1), pages 349--366, March 2014 (ASPLOS '14
%%% proceedings issue, per the remark field).  VSwapper is braced in the
%%% title so that styles which downcase titles keep its capitalization.
@Article{Amit:2014:VMS,
  author =       "Nadav Amit and Dan Tsafrir and Assaf Schuster",
  title =        "{VSwapper}: a memory swapper for virtualized
                 environments",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "349--366",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541969",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The number of guest virtual machines that can be
                 consolidated on one physical host is typically limited
                 by the memory size, motivating memory overcommitment.
                 Guests are given a choice to either install a
                 ``balloon'' driver to coordinate the overcommitment
                 activity, or to experience degraded performance due to
                 uncooperative swapping. Ballooning, however, is not a
                 complete solution, as hosts must still fall back on
                 uncooperative swapping in various circumstances.
                 Additionally, ballooning takes time to accommodate
                 change, and so guests might experience degraded
                 performance under changing conditions. Our goal is to
                 improve the performance of hosts when they fall back on
                 uncooperative swapping and/or operate under changing
                 load conditions. We carefully isolate and characterize
                 the causes for the associated poor performance, which
                 include various types of superfluous swap operations,
                 decayed swap file sequentiality, and ineffective
                 prefetch decisions upon page faults. We address these
                 problems by implementing VSwapper, a guest-agnostic
                 memory swapper for virtual environments that allows
                 efficient, uncooperative overcommitment. With inactive
                 ballooning, VSwapper yields up to an order of magnitude
                 performance improvement. Combined with ballooning,
                 VSwapper can achieve up to double the performance under
                 changing load conditions.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

%%% Andrus et al. on Cider, running iOS apps natively on Android:
%%% SIGARCH Computer Architecture News 42(1), pages 367--382, March 2014
%%% (ASPLOS '14 proceedings issue, per the remark field).  The two-word
%%% surname Van't Hof is braced so that BibTeX keeps it as a single last
%%% name instead of treating the capitalized Van't as a given name.
@Article{Andrus:2014:CNE,
  author =       "Jeremy Andrus and Alexander {Van't Hof} and Naser
                 AlDuaij and Christoffer Dall and Nicolas Viennot and
                 Jason Nieh",
  title =        "{Cider}: native execution of {iOS} apps on {Android}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "367--382",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541972",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "We present Cider, an operating system compatibility
                 architecture that can run applications built for
                 different mobile ecosystems, iOS or Android, together
                 on the same smartphone or tablet. Cider enhances the
                 domestic operating system, Android, of a device with
                 kernel-managed, per-thread personas to mimic the
                 application binary interface of a foreign operating
                 system, iOS, enabling it to run unmodified foreign
                 binaries. This is accomplished using a novel
                 combination of binary compatibility techniques
                 including two new mechanisms: compile-time code
                 adaptation, and diplomatic functions. Compile-time code
                 adaptation enables existing unmodified foreign source
                 code to be reused in the domestic kernel, reducing
                 implementation effort required to support multiple
                 binary interfaces for executing domestic and foreign
                 applications. Diplomatic functions leverage per-thread
                 personas, and allow foreign applications to use
                 domestic libraries to access proprietary software and
                 hardware interfaces. We have built a Cider prototype,
                 and demonstrate that it imposes modest performance
                 overhead and runs unmodified iOS and Android
                 applications together on a Google Nexus tablet running
                 the latest version of Android.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

%%% Litz et al. on SI-TM, snapshot-isolation transactional memory:
%%% SIGARCH Computer Architecture News 42(1), pages 383--398, March 2014
%%% (ASPLOS '14 proceedings issue, per the remark field).  SI-TM is
%%% braced in the title so that styles which downcase titles keep its
%%% capitalization.
@Article{Litz:2014:STR,
  author =       "Heiner Litz and David Cheriton and Amin Firoozshahian
                 and Omid Azizi and John P. Stevenson",
  title =        "{SI-TM}: reducing transactional memory abort rates
                 through snapshot isolation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "383--398",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541952",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Transactional memory represents an attractive
                 conceptual model for programming concurrent
                 applications. Unfortunately, high transaction abort
                 rates can cause significant performance degradation.
                 Conventional transactional memory realizations not only
                 pessimistically abort transactions on every read-write
                 conflict but also because of false sharing, cache
                 evictions, TLB misses, page faults and interrupts.
                 Consequently, the use of transactions needs to be
                 restricted to a very small number of operations to
                 achieve predictable performance, thereby, limiting its
                 benefit to programming simplification. In this paper,
                 we investigate snapshot isolation transactional memory
                 in which transactions operate on memory snapshots that
                 always guarantee consistent reads. By exploiting
                 snapshots, an established database model of
                 transactions, transactions can ignore read-write
                 conflicts and only need to abort on write-write
                 conflicts. Our implementation utilizes a memory
                 controller that supports multiversion memory, to
                 efficiently support snapshotting in hardware. We show
                 that snapshot isolation can reduce the number of aborts
                 in some cases by three orders of magnitude and improve
                 performance by up to 20x.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

%%% Ruan, Vyas, Liu, and Spear: experience report on converting
%%% memcached to transactions with the GCC implementation of the Draft
%%% C++ TM Specification; SIGARCH Computer Architecture News 42(1),
%%% pages 399--412, March 2014 (ASPLOS '14 proceedings issue, per the
%%% remark field).  The bibsource value names three sources, separated
%%% by semicolons, including gnu.bib as well as sigarch.bib.
@Article{Ruan:2014:TLC,
  author =       "Wenjia Ruan and Trilok Vyas and Yujie Liu and Michael
                 Spear",
  title =        "Transactionalizing legacy code: an experience report
                 using {GCC} and {Memcached}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "399--412",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541960",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/gnu.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The addition of transactional memory (TM) support to
                 existing languages provides the opportunity to create
                 new software from scratch using transactions, and also
                 to simplify or extend legacy code by replacing existing
                 synchronization with language-level transactions. In
                 this paper, we describe our experiences
                 transactionalizing the memcached application through
                 the use of the GCC implementation of the Draft C++ TM
                 Specification. We present experiences and
                 recommendations that we hope will guide the effort to
                 integrate TM into languages, and that may also
                 contribute to the growing collective knowledge about
                 how programmers can begin to exploit TM in existing
                 production-quality software.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

%%% Morrison and Afek on fence-free work stealing for bounded-TSO
%%% processors: SIGARCH Computer Architecture News 42(1), pages
%%% 413--426, March 2014 (ASPLOS '14 proceedings issue, per the remark
%%% field).  The percentage range in the abstract is written with a
%%% double hyphen (en dash), matching Chase--Lev in the same abstract.
@Article{Morrison:2014:FFW,
  author =       "Adam Morrison and Yehuda Afek",
  title =        "Fence-free work stealing on bounded {TSO} processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "413--426",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541987",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Work stealing is the method of choice for load
                 balancing in task parallel programming languages and
                 frameworks. Yet despite considerable effort invested in
                 optimizing work stealing task queues, existing
                 algorithms issue a costly memory fence when removing a
                 task, and these fences are believed to be necessary for
                 correctness. This paper refutes this belief,
                 demonstrating work stealing algorithms in which a
                 worker does not issue a memory fence for
                 microarchitectures with a bounded total store ordering
                 (TSO) memory model. Bounded TSO is a novel restriction
                 of TSO --- capturing mainstream x86 and SPARC TSO
                 processors --- that bounds the number of stores a load
                 can be reordered with. Our algorithms eliminate the
                 memory fence penalty, improving the running time of a
                 suite of parallel benchmarks on modern x86 multicore
                 processors by 7\%--11\% on average (and up to 23\%),
                 compared to the Cilk and Chase--Lev work stealing
                 queues.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

%%% Hower et al. on heterogeneous-race-free (HRF) memory models with
%%% scoped synchronization: SIGARCH Computer Architecture News 42(1),
%%% pages 427--440, March 2014 (ASPLOS '14 proceedings issue, per the
%%% remark field).
@Article{Hower:2014:HRF,
  author =       "Derek R. Hower and Blake A. Hechtman and Bradford M.
                 Beckmann and Benedict R. Gaster and Mark D. Hill and
                 Steven K. Reinhardt and David A. Wood",
  title =        "Heterogeneous-race-free memory models",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "427--440",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541981",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Commodity heterogeneous systems (e.g., integrated CPUs
                 and GPUs), now support a unified, shared memory address
                 space for all components. Because the latency of global
                 communication in a heterogeneous system can be
                 prohibitively high, heterogeneous systems (unlike
                 homogeneous CPU systems) provide synchronization
                 mechanisms that only guarantee ordering among a subset
                 of threads, which we call a scope. Unfortunately, the
                 consequences and semantics of these scoped operations
                 are not yet well understood. Without a formal and
                 approachable model to reason about the behavior of
                 these operations, we risk an array of portability and
                 performance issues. In this paper, we embrace scoped
                 synchronization with a new class of memory consistency
                 models that add scoped synchronization to
                 data-race-free models like those of C++ and Java.
                 Called sequential consistency for
                 heterogeneous-race-free (SC for HRF), the new models
                 guarantee SC for programs with ``sufficient''
                 synchronization (no data races) of ``sufficient''
                 scope. We discuss two such models. The first,
                 HRF-direct, works well for programs with highly regular
                 parallelism. The second, HRF-indirect, builds on
                 HRF-direct by allowing synchronization using different
                 scopes in some cases involving transitive
                 communication. We quantitatively show that HRF-indirect
                 encourages forward-looking programs with irregular
                 parallelism by showing up to a 10\% performance
                 increase in a task runtime for GPUs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

%%% Jung, Choi, Shalf, and Kandemir on Triple-A, a non-SSD based
%%% autonomic all-flash array: SIGARCH Computer Architecture News 42(1),
%%% pages 441--454, March 2014 (ASPLOS '14 proceedings issue, per the
%%% remark field).  Triple-A and SSD are braced in the title so that
%%% styles which downcase titles keep their capitalization.
@Article{Jung:2014:TNS,
  author =       "Myoungsoo Jung and Wonil Choi and John Shalf and
                 Mahmut Taylan Kandemir",
  title =        "{Triple-A}: a non-{SSD} based autonomic all-flash
                 array for high performance storage systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "441--454",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541953",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Solid State Disk (SSD) arrays are in a position to (at
                 least partially) replace spinning disk arrays in high
                 performance computing (HPC) systems due to their better
                 performance and lower power consumption. However, these
                 emerging SSD arrays are facing enormous challenges,
                 which are not observed in disk-based arrays.
                 Specifically, we observe that the performance of SSD
                 arrays can significantly degrade due to various
                 array-level resource contentions. In addition, their
                 maintenance costs exponentially increase over time,
                 which renders them difficult to deploy widely in HPC
                 systems. To address these challenges, we propose
                 Triple-A, a non-SSD based Autonomic All-Flash Array,
                 which is a self-optimizing, from-scratch NAND flash
                 cluster. Triple-A can detect two different types of
                 resource contentions and autonomically alleviate them
                 by reshaping the physical data-layout on its flash
                 array network. Our experimental evaluation using both
                 real workloads and a micro-benchmark show that Triple-A
                 can offer a 53\% higher sustained throughput and an
                 80\% lower I/O latency than non-autonomic SSD arrays.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

%%% Liu et al. on NVM Duet, a unified working-memory and
%%% persistent-store architecture: SIGARCH Computer Architecture News
%%% 42(1), pages 455--470, March 2014 (ASPLOS '14 proceedings issue, per
%%% the remark field).  The whole system name NVM Duet (as the abstract
%%% spells it) is braced in the title so that styles which downcase
%%% titles do not lowercase Duet.
@Article{Liu:2014:NDU,
  author =       "Ren-Shuo Liu and De-Yu Shen and Chia-Lin Yang and
                 Shun-Chih Yu and Cheng-Yuan Michael Wang",
  title =        "{NVM Duet}: unified working memory and persistent
                 store architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "455--470",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541957",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Emerging non-volatile memory (NVM) technologies have
                 gained a lot of attention recently. The
                 byte-addressability and high density of NVM enable
                 computer architects to build large-scale main memory
                 systems. NVM has also been shown to be a promising
                 alternative to conventional persistent store. With NVM,
                 programmers can persistently retain in-memory data
                 structures without writing them to disk. Therefore, one
                 can envision that in the future, NVM will play the role
                 of both working memory and persistent store at the same
                 time. Persistent store demands consistency and
                 durability guarantees, thereby imposing new design
                 constraints on the memory system. Consistency is
                 achieved at the expense of serializing multiple write
                 operations. Durability requires memory cells to
                 guarantee non-volatility and thus reduces the write
                 speed. Therefore, a unified architecture oblivious to
                 these two use cases would lead to suboptimal design. In
                 this paper, we propose a novel unified working memory
                 and persistent store architecture, NVM Duet, which
                 provides the required consistency and durability
                 guarantees for persistent store while relaxing these
                 constraints if accesses to NVM are for working memory.
                 A cross-layer design approach is adopted to achieve the
                 design goal. Overall, simulation results demonstrate
                 that NVM Duet achieves up to 1.68x (1.32x on average)
                 speedup compared with the baseline design.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

%%% ASPLOS '14 paper, as carried in ACM SIGARCH Computer Architecture
%%% News 42(1), March 2014, pages 471--484: software-defined flash (SDF),
%%% a hardware/software co-designed storage system deployed at Baidu.
%%% Percent signs in the abstract are written as \% so that the text
%%% survives typesetting.
@Article{Ouyang:2014:SSD,
  author =       "Jian Ouyang and Shiding Lin and Song Jiang and Zhenyu
                 Hou and Yong Wang and Yuanzheng Wang",
  title =        "{SDF}: software-defined flash for {Web}-scale
                 {Internet} storage systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "471--484",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541959",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In the last several years hundreds of thousands of
                 SSDs have been deployed in the data centers of Baidu,
                 China's largest Internet search company. Currently only
                 40\% or less of the raw bandwidth of the flash memory
                 in the SSDs is delivered by the storage system to the
                 applications. Moreover, because of space
                 over-provisioning in the SSD to accommodate
                 non-sequential or random writes, and additionally,
                 parity coding across flash channels, typically only
                 50-70\% of the raw capacity of a commodity SSD can be
                 used for user data. Given the large scale of Baidu's
                 data center, making the most effective use of its SSDs
                 is of great importance. Specifically, we seek to
                 maximize both bandwidth and usable capacity. To achieve
                 this goal we propose {\em software-defined flash}
                 (SDF), a hardware/software co-designed storage system
                 to maximally exploit the performance characteristics of
                 flash memory in the context of our workloads. SDF
                 exposes individual flash channels to the host software
                 and eliminates space over-provisioning. The host
                 software, given direct access to the raw flash channels
                 of the SSD, can effectively organize its data and
                 schedule its data access to better realize the SSD's
                 raw performance potential. Currently more than 3000
                 SDFs have been deployed in Baidu's storage system that
                 supports its web page and image repository services.
                 Our measurements show that SDF can deliver
                 approximately 95\% of the raw flash bandwidth and
                 provide 99\% of the flash capacity for user data. SDF
                 increases I/O bandwidth by 300\% and reduces per-GB
                 hardware cost by 50\% on average compared with the
                 commodity-SSD-based system used at Baidu.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

%%% ASPLOS '14 paper, as carried in ACM SIGARCH Computer Architecture
%%% News 42(1), March 2014, pages 485--498: 3D-stacked server designs
%%% (Mercury and Iridium) for denser Memcached key-value stores.
@Article{Gutierrez:2014:ISS,
  author =       "Anthony Gutierrez and Michael Cieslak and Bharan
                 Giridhar and Ronald G. Dreslinski and Luis Ceze and
                 Trevor Mudge",
  title =        "Integrated {$3$D}-stacked server designs for
                 increasing physical density of key-value stores",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "485--498",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541951",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Key-value stores, such as Memcached, have been used to
                 scale web services since the beginning of the Web 2.0
                 era. Data center real estate is expensive, and several
                 industry experts we have spoken to have suggested that
                 a significant portion of their data center space is
                 devoted to key value stores. Despite its wide-spread
                 use, there is little in the way of hardware
                 specialization for increasing the efficiency and
                 density of Memcached; it is currently deployed on
                 commodity servers that contain high-end CPUs designed
                 to extract as much instruction-level parallelism as
                 possible. Out-of-order CPUs, however have been shown to
                 be inefficient when running Memcached. To address
                 Memcached efficiency issues, we propose two
                 architectures using 3D stacking to increase data
                 storage efficiency. Our first 3D architecture, Mercury,
                 consists of stacks of ARM Cortex-A7 cores with 4GB of
                 DRAM, as well as NICs. Our second architecture,
                 Iridium, replaces DRAM with NAND Flash to improve
                 density. We explore, through simulation, the potential
                 efficiency benefits of running Memcached on servers
                 that use 3D-stacking to closely integrate low-power
                 CPUs with NICs and memory. With Mercury we demonstrate
                 that density may be improved by 2.9X, power efficiency
                 by 4.9X, throughput by 10X, and throughput per GB by
                 3.5X over a state-of-the-art server running optimized
                 Memcached. With Iridium we show that density may be
                 increased by 14X, power efficiency by 2.4X, and
                 throughput by 5.2X, while still meeting latency
                 requirements for a majority of requests.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

%%% ASPLOS '14 paper, as carried in ACM SIGARCH Computer Architecture
%%% News 42(1), March 2014, pages 499--512: on-demand, portable and
%%% parameterless deterministic execution built on the Galois
%%% programming model.
@Article{Nguyen:2014:DGD,
  author =       "Donald Nguyen and Andrew Lenharth and Keshav Pingali",
  title =        "Deterministic {Galois}: on-demand, portable and
                 parameterless",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "499--512",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541964",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Non-determinism in program execution can make program
                 development and debugging difficult. In this paper, we
                 argue that solutions to this problem should be
                 on-demand, portable and parameterless. On-demand means
                 that the programming model should permit the writing of
                 non-deterministic programs since these programs often
                 perform better than deterministic ones for the same
                 problem. Portable means that the program should produce
                 the same answer even if it is run on different
                 machines. Parameterless means that if there are
                 machine-dependent scheduling parameters that must be
                 tuned for good performance, they must not affect the
                 output. Although many solutions for deterministic
                 program execution have been proposed in the literature,
                 they fall short along one or more of these dimensions.
                 To remedy this, we propose a new approach, based on the
                 Galois programming model, in which (i) the programming
                 model permits the writing of non-deterministic programs
                 and (ii) the runtime system executes these programs
                 deterministically if needed. Evaluation of this
                 approach on a collection of benchmarks from the PARSEC,
                 PBBS, and Lonestar suites shows that it delivers
                 deterministic execution with substantially less
                 overhead than other systems in the literature.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

%%% ASPLOS '14 paper, as carried in ACM SIGARCH Computer Architecture
%%% News 42(1), March 2014, pages 513--528: HERMES, a work-stealing
%%% language runtime that saves energy by adjusting thread tempo
%%% through DVFS.
@Article{Ribic:2014:EEW,
  author =       "Haris Ribic and Yu David Liu",
  title =        "Energy-efficient work-stealing language runtimes",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "513--528",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541971",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Work stealing is a promising approach to constructing
                 multithreaded program runtimes of parallel programming
                 languages. This paper presents HERMES, an
                 energy-efficient work-stealing language runtime. The
                 key insight is that threads in a work-stealing
                 environment --- thieves and victims --- have varying
                 impacts on the overall program running time, and a
                 coordination of their execution ``tempo'' can lead to
                 energy efficiency with minimal performance loss. The
                 centerpiece of HERMES is two complementary algorithms
                 to coordinate thread tempo: the workpath-sensitive
                 algorithm determines tempo for each thread based on
                 thief-victim relationships on the execution path,
                 whereas the workload-sensitive algorithm selects
                 appropriate tempo based on the size of work-stealing
                 deques. We construct HERMES on top of Intel Cilk Plus's
                 runtime, and implement tempo adjustment through
                 standard Dynamic Voltage and Frequency Scaling (DVFS).
                 Benchmarks running on HERMES demonstrate an average of
                 11-12\% energy savings with an average of 3-4\%
                 performance loss through meter-based measurements over
                 commercial CPUs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

%%% ASPLOS '14 paper, as carried in ACM SIGARCH Computer Architecture
%%% News 42(1), March 2014, pages 529--542: a parallel algorithm for
%%% finite-state machines that enumerates transitions from all states
%%% on each input symbol.
@Article{Mytkowicz:2014:DPF,
  author =       "Todd Mytkowicz and Madanlal Musuvathi and Wolfram
                 Schulte",
  title =        "Data-parallel finite-state machines",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "529--542",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541988",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "A finite-state machine (FSM) is an important
                 abstraction for solving several problems, including
                 regular-expression matching, tokenizing text, and
                 Huffman decoding. FSM computations typically involve
                 data-dependent iterations with unpredictable
                 memory-access patterns making them difficult to
                 parallelize. This paper describes a parallel algorithm
                 for FSMs that breaks dependences across iterations by
                 efficiently enumerating transitions from all possible
                 states on each input symbol. This allows the algorithm
                 to utilize various sources of data parallelism
                 available on modern hardware, including vector
                 instructions and multiple processors/cores. For
                 instance, on benchmarks from three FSM applications:
                 regular expressions, Huffman decoding, and HTML
                 tokenization, the parallel algorithm achieves up to a
                 3x speedup over optimized sequential baselines on a
                 single core, and linear speedups up to 21x on 8
                 cores.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

%%% ASPLOS '14 paper, as carried in ACM SIGARCH Computer Architecture
%%% News 42(1), March 2014, pages 543--558: model-based speculation
%%% schemes for parallelizing finite-state machine computations.
@Article{Zhao:2014:CES,
  author =       "Zhijia Zhao and Bo Wu and Xipeng Shen",
  title =        "Challenging the {``embarrassingly sequential''}:
                 parallelizing finite state machine-based computations
                 through principled speculation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "543--558",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541989",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Finite-State Machine (FSM) applications are important
                 for many domains. But FSM computation is inherently
                 sequential, making such applications notoriously
                 difficult to parallelize. Most prior methods address
                 the problem through speculations on simple heuristics,
                 offering limited applicability and inconsistent
                 speedups. This paper provides some principled
                 understanding of FSM parallelization, and offers the
                 first disciplined way to exploit application-specific
                 information to inform speculations for parallelization.
                 Through a series of rigorous analysis, it presents a
                 probabilistic model that captures the relations between
                 speculative executions and the properties of the target
                 FSM and its inputs. With the formulation, it proposes
                 two model-based speculation schemes that automatically
                 customize themselves with the suitable configurations
                 to maximize the parallelization benefits. This rigorous
                 treatment yields near-linear speedup on applications
                 that state-of-the-art techniques can barely
                 accelerate.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

%%% ASPLOS '14 paper, as carried in ACM SIGARCH Computer Architecture
%%% News 42(1), March 2014, pages 559--574: the Sharing Architecture, a
%%% manycore design reconfigurable on a sub-core basis for IaaS clouds.
@Article{Zhou:2014:SAS,
  author =       "Yanqi Zhou and David Wentzlaff",
  title =        "The sharing architecture: sub-core configurability for
                 {IaaS} clouds",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "559--574",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541950",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Businesses and Academics are increasingly turning to
                 Infrastructure as a Service (IaaS) Clouds such as
                 Amazon's Elastic Compute Cloud (EC2) to fulfill their
                 computing needs. Unfortunately, current IaaS systems
                 provide a severely restricted pallet of rentable
                 computing options which do not optimally fit the
                 workloads that they are executing. We address this
                 challenge by proposing and evaluating a manycore
                 architecture, called the Sharing Architecture,
                 specifically optimized for IaaS systems by being
                 reconfigurable on a sub-core basis. The Sharing
                 Architecture enables better matching of workload to
                 micro-architecture resources by replacing static cores
                 with Virtual Cores which can be dynamically
                 reconfigured to have different numbers of ALUs and
                 amount of Cache. This reconfigurability enables many of
                 the same benefits of heterogeneous multicores, but in a
                 homogeneous fabric, and enables the reuse and resale of
                 resources on a per ALU or per KB of cache basis. The
                 Sharing Architecture leverages Distributed ILP
                 techniques, but is designed in a way to be independent
                 of recompilation. In addition, we introduce an economic
                 model which is enabled by the Sharing Architecture and
                 show how different users who have varying needs can be
                 better served by such a flexible architecture. We
                 evaluate the Sharing Architecture across a benchmark
                 suite of Apache, SPECint, and parts of PARSEC, and find
                 that it can achieve up to a 5x more economically
                 efficient market when compared to static architecture
                 multicores. We implemented the Sharing Architecture in
                 Verilog and present area overhead results.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

%%% ASPLOS '14 paper, as carried in ACM SIGARCH Computer Architecture
%%% News 42(1), March 2014, pages 575--590: ASC, an architecture that
%%% scales sequential programs by speculatively executing predicted
%%% partitions of the execution path in parallel.
@Article{Waterland:2014:AAS,
  author =       "Amos Waterland and Elaine Angelino and Ryan P. Adams
                 and Jonathan Appavoo and Margo Seltzer",
  title =        "{ASC}: automatically scalable computation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "575--590",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541985",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "We present an architecture designed to transparently
                 and automatically scale the performance of sequential
                 programs as a function of the hardware resources
                 available. The architecture is predicated on a model of
                 computation that views program execution as a walk
                 through the enormous state space composed of the memory
                 and registers of a single-threaded processor. Each
                 instruction execution in this model moves the system
                 from its current point in state space to a
                 deterministic subsequent point. We can parallelize such
                 execution by predictively partitioning the complete
                 path and speculatively executing each partition in
                 parallel. Accurately partitioning the path is a
                 challenging prediction problem. We have implemented our
                 system using a functional simulator that emulates the
                 x86 instruction set, including a collection of state
                 predictors and a mechanism for speculatively executing
                 threads that explore potential states along the
                 execution path. While the overhead of our simulation
                 makes it impractical to measure speedup relative to
                 native x86 execution, experiments on three benchmarks
                 show scalability of up to a factor of 256 on a 1024
                 core machine when executing unmodified sequential
                 programs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

%%% ASPLOS '14 paper, as carried in ACM SIGARCH Computer Architecture
%%% News 42(1), March 2014, pages 591--606: study of homogeneous SMT
%%% multi-cores versus heterogeneous multi-cores under dynamically
%%% varying active thread counts.
@Article{Eyerman:2014:BSM,
  author =       "Stijn Eyerman and Lieven Eeckhout",
  title =        "The benefit of {SMT} in the multi-core era:
                 flexibility towards degrees of thread-level
                 parallelism",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "591--606",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541954",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The number of active threads in a multi-core processor
                 varies over time and is often much smaller than the
                 number of supported hardware threads. This requires
                 multi-core chip designs to balance core count and
                 per-core performance. Low active thread counts benefit
                 from a few big, high-performance cores, while high
                 active thread counts benefit more from a sea of small,
                 energy-efficient cores. This paper comprehensively
                 studies the trade-offs in multi-core design given
                 dynamically varying active thread counts. We find that,
                 under these workload conditions, a homogeneous
                 multi-core processor, consisting of a few
                 high-performance SMT cores, typically outperforms
                 heterogeneous multi-cores consisting of a mix of big
                 and small cores (without SMT), within the same power
                 budget. We also show that a homogeneous multi-core
                 performs almost as well as a heterogeneous multi-core
                 that also implements SMT, as well as a dynamic
                 multi-core, while being less complex to design and
                 verify. Further, heterogeneous multi-cores that
                 power-gate idle cores yield (only) slightly better
                 energy-efficiency compared to homogeneous multi-cores.
                 The overall conclusion is that the benefit of SMT in
                 the multi-core era is to provide flexibility with
                 respect to the available thread-level parallelism.
                 Consequently, homogeneous multi-cores with big SMT
                 cores are competitive high-performance,
                 energy-efficient design points for workloads with
                 dynamically varying active thread counts.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

%%% ASPLOS '14 paper, as carried in ACM SIGARCH Computer Architecture
%%% News 42(1), March 2014, pages 607--622: complexity and potential of
%%% compilation scheduling in JIT-based runtime systems.
@Article{Ding:2014:FLE,
  author =       "Yufei Ding and Mingzhou Zhou and Zhijia Zhao and Sarah
                 Eisenstat and Xipeng Shen",
  title =        "Finding the limit: examining the potential and
                 complexity of compilation scheduling for {JIT}-based
                 runtime systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "607--622",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541945",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This work aims to find out the full potential of
                 compilation scheduling for JIT-based runtime systems.
                 Compilation scheduling determines the order in which
                 the compilation units (e.g., functions) in a program
                 are to be compiled or recompiled. It decides when what
                 versions of the units are ready to run, and hence
                 affects performance. But it has been a largely
                 overlooked direction in JIT-related research, with some
                 fundamental questions left open: How significant
                 compilation scheduling is for performance, how good the
                 scheduling schemes employed by existing runtime systems
                 are, and whether a great potential exists for
                 improvement. This study proves the strong
                 NP-completeness of the problem, proposes a heuristic
                 algorithm that yields near optimal schedules, examines
                 the potential of two current scheduling schemes
                 empirically, and explores the relations with JIT
                 designs. It provides the first principled understanding
                 to the complexity and potential of compilation
                 scheduling, shedding some insights for JIT-based
                 runtime system improvement.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

%%% ASPLOS '14 paper, as carried in ACM SIGARCH Computer Architecture
%%% News 42(1), March 2014, pages 623--638: a Combined Multiply-Add
%%% (CMA) instruction and speculative fusion for HW/SW co-designed
%%% processors.  The bibsource field also names fparith.bib.
@Article{Lupon:2014:SHS,
  author =       "Marc Lupon and Enric Gibert and Grigorios Magklis and
                 Sridhar Samudrala and Ra{\'u}l Mart{\'\i}nez and
                 Kyriakos Stavrou and David R. Ditzel",
  title =        "Speculative hardware\slash software co-designed
                 floating-point multiply-add fusion",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "623--638",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541978",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "A Fused Multiply-Add (FMA) instruction is currently
                 available in many general-purpose processors. It
                 increases performance by reducing latency of dependent
                 operations and increases precision by computing the
                 result as an indivisible operation with no intermediate
                 rounding. However, since the arithmetic behavior of a
                 single-rounding FMA operation is different than
                 independent FP multiply followed by FP add
                 instructions, some algorithms require significant
                 revalidation and rewriting efforts to work as expected
                 when they are compiled to operate with FMA --- a cost
                 that developers may not be willing to pay. Because of
                 that, abundant legacy applications are not able to
                 utilize FMA instructions. In this paper we propose a
                 novel HW/SW collaborative technique that is able to
                 efficiently execute workloads with increased
                 utilization of FMA, by adding the option to get the
                 same numerical result as separate FP multiply and FP
                 add pairs. In particular, we extended the host ISA of a
                 HW/SW co-designed processor with a new Combined
                 Multiply-Add (CMA) instruction that performs an FMA
                 operation with an intermediate rounding. This new
                 instruction is used by a transparent dynamic
                 translation software layer that uses a speculative
                 instruction-fusion optimization to transform FP
                 multiply and FP add sequences into CMA instructions.
                 The FMA unit has been slightly modified to support both
                 single-rounding and double-rounding fused instructions
                 without increasing their latency and to provide a
                 conservative fall-back path in case of misspeculation.
                 Evaluation on a cycle-accurate timing simulator showed
                 that CMA improved SPECfp performance by 6.3\% and
                 reduced executed instructions by 4.7\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

%%% ASPLOS '14 paper, as carried in ACM SIGARCH Computer Architecture
%%% News 42(1), March 2014, pages 639--652: Genetic Optimization
%%% Algorithm (GOA), a post-compilation search over x86 assembly
%%% variants aimed at reducing energy use.
@Article{Schulte:2014:PCS,
  author =       "Eric Schulte and Jonathan Dorn and Stephen Harding and
                 Stephanie Forrest and Westley Weimer",
  title =        "Post-compiler software optimization for reducing
                 energy",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "639--652",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541980",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Modern compilers typically optimize for executable
                 size and speed, rarely exploring non-functional
                 properties such as power efficiency. These properties
                 are often hardware-specific, time-intensive to
                 optimize, and may not be amenable to standard dataflow
                 optimizations. We present a general post-compilation
                 approach called Genetic Optimization Algorithm (GOA),
                 which targets measurable non-functional aspects of
                 software execution in programs that compile to x86
                 assembly. GOA combines insights from profile-guided
                 optimization, superoptimization, evolutionary
                 computation and mutational robustness. GOA searches for
                 program variants that retain required functional
                 behavior while improving non-functional behavior, using
                 characteristic workloads and predictive modeling to
                 guide the search. The resulting optimizations are
                 validated using physical performance measurements and a
                 larger held-out test suite. Our experimental results on
                 PARSEC benchmark programs show average energy
                 reductions of 20\%, both for a large AMD system and a
                 small Intel system, while maintaining program
                 functionality on target workloads.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

@Article{Wood:2014:RSA,
  author =       "David A. Wood",
  title =        "Resolved: specialized architectures, languages, and
                 system software should supplant general-purpose
                 alternatives within a decade",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "653--654",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2563369",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The field of computing has struggled since its
                 inception with the tension between specialization and
                 generalization. Specialized architectures, programming
                 languages, and system software promise better
                 performance (across many metrics, including efficiency,
                 productivity, etc.) for workloads that match their
                 specialization objective. General-purpose
                 architectures, languages, and system software sacrifice
                 extremes of performance for specific workloads, seeking
                 acceptable performance across a much wider range. While
                 specialized alternatives have always had their place,
                 general-purpose architectures, languages, and system
                 software have dominated main-stream computing systems
                 for the past several decades. But with Dennard scaling
                 already gone and the end of Moore's Law looming, some
                 have argued that general-purpose computing platforms
                 must naturally give way to specialization. In this
                 debate, two teams of highly-opinionated experts will
                 debate the proposition that specialized architectures,
                 languages, and system software should largely supplant
                 general-purpose alternatives within the next decade.
                 Arguments in favor of specialization include energy
                 efficiency in the post-Dennard scaling era, performance
                 scaling in the post-Moore's law era, and improvements
                 in programmer productivity. Arguments against include
                 the large investment needed to create specialized
                 hardware and software components, lack of tools and
                 interfaces to create reusable components, the semantic
                 gap from overspecialization, and security
                 vulnerabilities and general correctness issues due to
                 interoperation of specialized components.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

@Article{Ruwase:2014:GHF,
  author =       "Olatunji Ruwase and Michael A. Kozuch and Phillip B.
                 Gibbons and Todd C. Mowry",
  title =        "{Guardrail}: a high fidelity approach to protecting
                 hardware devices from buggy drivers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "655--670",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541970",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Device drivers are an Achilles' heel of modern
                 commodity operating systems, accounting for far too
                 many system failures. Previous work on driver
                 reliability has focused on protecting the kernel from
                 unsafe driver side-effects by interposing an
                 invariant-checking layer at the driver interface, but
                 otherwise treating the driver as a black box. In this
                 paper, we propose and evaluate Guardrail, which is a
                 more powerful framework for run-time driver analysis
                 that performs decoupled instruction-grain dynamic
                 correctness checking on arbitrary kernel-mode drivers
                 as they execute, thereby enabling the system to detect
                 and mitigate more challenging correctness bugs (e.g.,
                 data races, uninitialized memory accesses) that cannot
                 be detected by today's fault isolation techniques. Our
                 evaluation of Guardrail shows that it can find serious
                 data races, memory faults, and DMA faults in native
                 Linux drivers that required fixes, including previously
                 unknown bugs. Also, with hardware logging support,
                 Guardrail can be used for online protection of
                 persistent device state from driver bugs with at most
                 10\% overhead on the end-to-end performance of most
                 standard I/O workloads.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

@Article{Wood:2014:LLD,
  author =       "Benjamin P. Wood and Luis Ceze and Dan Grossman",
  title =        "Low-level detection of language-level data races with
                 {LARD}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "671--686",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541955",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Researchers have proposed always-on data-race
                 exceptions as a way to avoid the ill effects of data
                 races, but slow performance of accurate dynamic
                 data-race detection remains a barrier to the adoption
                 of always-on data-race exceptions. Proposals for
                 accurate low-level (e.g., hardware) data-race detection
                 have the potential to reduce this performance barrier.
                 This paper explains why low-level data-race detectors
                 are wrong for programs written in high-level languages
                 (e.g., Java): they miss true data races and report
                 false data races in these programs. To bring the
                 benefits of low-level data-race detection to high-level
                 languages, we design low-level abstractable race
                 detection (LARD), an extension of the interface between
                 low-level data-race detectors and run-time systems that
                 enables accurate language-level data-race detection
                 using low-level detection mechanisms. We implement
                 accurate LARD data-race exception support for Java,
                 coupling a modified Jikes RVM Java virtual machine and
                 a simulated hardware race detector. We evaluate our
                 detector's accuracy against an accurate dynamic Java
                 data-race detector and other low-level race detectors
                 without LARD, showing that naive accurate low-level
                 data-race detectors suffer from many missed and false
                 language-level races in practice, and that LARD
                 prevents this inaccuracy.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

@Article{Zhang:2014:EES,
  author =       "Jiaqi Zhang and Lakshminarayanan Renganarayana and
                 Xiaolan Zhang and Niyu Ge and Vasanth Bala and Tianyin
                 Xu and Yuanyuan Zhou",
  title =        "{EnCore}: exploiting system environment and
                 correlation information for misconfiguration
                 detection",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "687--700",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541983",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "As software systems become more complex and
                 configurable, failures due to misconfigurations are
                 becoming a critical problem. Such failures often have
                 serious functionality, security and financial
                 consequences. Further, diagnosis and remediation for
                 such failures require reasoning across the software
                 stack and its operating environment, making it
                 difficult and costly. We present a framework and tool
                 called EnCore to automatically detect software
                 misconfigurations. EnCore takes into account two
                 important factors that are unexploited before: the
                 interaction between the configuration settings and the
                 executing environment, as well as the rich correlations
                 between configuration entries. We embrace the emerging
                 trend of viewing systems as data, and exploit this to
                 extract information about the execution environment in
                 which a configuration setting is used. EnCore learns
                 configuration rules from a given set of sample
                 configurations. With training data enriched with the
                 execution context of configurations, EnCore is able to
                 learn a broad set of configuration anomalies that spans
                 the entire system. EnCore is effective in detecting
                 both injected errors and known real-world problems ---
                 it finds 37 new misconfigurations in Amazon EC2 public
                 images and 24 new configuration problems in a
                 commercial private cloud. By systematically exploiting
                 environment information and by learning correlation
                 rules across multiple configuration settings, EnCore
                 detects 1.6x to 3.5x more misconfiguration anomalies
                 than previous approaches.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

@Article{Voskuilen:2014:HPF,
  author =       "Gwendolyn Voskuilen and T. N. Vijaykumar",
  title =        "High-performance fractal coherence",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "701--714",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541982",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Bugs in cache coherence protocols can cause system
                 failures. Despite many advances, verification runs into
                 state explosion for even moderately-sized systems. As
                 multicores' core counts increase, coherence
                 verifiability continues to be a key problem. A recent
                 proposal, called fractal coherence, avoids the state
                 explosion problem by applying the idea of observational
                 equivalence between a larger system and its smaller
                 sub-systems. A fractal protocol for a larger system is
                 verified by design if a minimal sub-system is verified
                 completely. While fractal coherence is a significant
                 step forward, there are two shortcomings: (1)
                 Architectural limitation: To achieve fractal
                 coherence's logical hierarchy, TreeFractal, the
                 specific fractal protocol, employs a tree architecture
                 where each miss traverses many levels up and down the
                 tree and each level redundantly holds its sub-trees'
                 coherence tags. (2) Protocol restrictions: TreeFractal
                 imposes a restriction on responses to read requests
                 that forces read requests to obtain clean blocks from
                 the nearest sharer even if the shared L2 or L3 is
                 faster. These limitations impose significant
                 performance and coherence tag state overheads. In this
                 paper, we propose architectural support for coherence
                 protocols to achieve scalable performance and
                 verifiability. To address the architectural limitation,
                 we propose FlatFractal, a directory-based architecture
                 which decouples fractal coherence's logical hierarchy
                 from the architecture and eliminates redundant tag
                 state. To address the protocol restriction, we propose
                 a simple change to the protocol that, while preserving
                 observational equivalence, allows read requests to
                 obtain the blocks from the shared L2 or L3. Our
                 simulations show that for 16 cores, FlatFractal
                 performs, on average, 57\% better than TreeFractal and
                 within 3\% of a conventional directory.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

@Article{Kwon:2014:LOC,
  author =       "Woo-Cheol Kwon and Tushar Krishna and Li-Shiuan Peh",
  title =        "Locality-oblivious cache organization leveraging
                 single-cycle multi-hop {NoCs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "715--728",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541976",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Locality has always been a critical factor in on-chip
                 data placement on CMPs as accessing further-away caches
                 has in the past been more costly than accessing nearby
                 ones. Substantial research on locality-aware designs
                 have thus focused on keeping a copy of the data
                 private. However, this complicates the problem of data
                 tracking and search/invalidation; tracking the state of
                 a line at all on-chip caches at a directory or
                 performing full-chip broadcasts are both non-scalable
                 and extremely expensive solutions. In this paper, we
                 make the case for Locality-Oblivious Cache Organization
                 (LOCO), a CMP cache organization that leverages the
                 on-chip network to create virtual single-cycle paths
                 between distant caches, thus redefining the notion of
                 locality. LOCO is a clustered cache organization,
                 supporting both homogeneous and heterogeneous cluster
                 sizes, and provides near single-cycle accesses to data
                 anywhere within the cluster, just like a private cache.
                 Globally, LOCO dynamically creates a virtual mesh
                 connecting all the clusters, and performs an efficient
                 global data search and migration over this virtual
                 mesh, without having to resort to full-chip broadcasts
                 or perform expensive directory lookups. Trace-driven
                 and full system simulations running SPLASH-2 and PARSEC
                 benchmarks show that LOCO improves application run time
                 by up to 44.5\% over baseline private and shared
                 cache.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

@Article{Kasture:2014:UEC,
  author =       "Harshad Kasture and Daniel Sanchez",
  title =        "{Ubik}: efficient cache sharing with strict {QoS} for
                 latency-critical workloads",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "729--742",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541944",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Chip-multiprocessors (CMPs) must often execute
                 workload mixes with different performance requirements.
                 On one hand, user-facing, latency-critical applications
                 (e.g., web search) need low tail (i.e., worst-case)
                 latencies, often in the millisecond range, and have
                 inherently low utilization. On the other hand,
                 compute-intensive batch applications (e.g., MapReduce)
                 only need high long-term average performance. In
                 current CMPs, latency-critical and batch applications
                 cannot run concurrently due to interference on shared
                 resources. Unfortunately, prior work on quality of
                 service (QoS) in CMPs has focused on guaranteeing
                 average performance, not tail latency. In this work, we
                 analyze several latency-critical workloads, and show
                 that guaranteeing average performance is insufficient
                 to maintain low tail latency, because
                 microarchitectural resources with state, such as caches
                 or cores, exert inertia on instantaneous workload
                 performance. Last-level caches impart the highest
                 inertia, as workloads take tens of milliseconds to warm
                 them up. When left unmanaged, or when managed with
                 conventional QoS frameworks, shared last-level caches
                 degrade tail latency significantly. Instead, we propose
                 Ubik, a dynamic partitioning technique that predicts
                 and exploits the transient behavior of latency-critical
                 workloads to maintain their tail latency while
                 maximizing the cache space available to batch
                 applications. Using extensive simulations, we show
                 that, while conventional QoS frameworks degrade tail
                 latency by up to 2.3x, Ubik simultaneously maintains
                 the tail latency of latency-critical workloads and
                 significantly improves the performance of batch
                 applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

@Article{Pichai:2014:ASA,
  author =       "Bharath Pichai and Lisa Hsu and Abhishek
                 Bhattacharjee",
  title =        "Architectural support for address translation on
                 {GPUs}: designing memory management units for
                 {CPU\slash GPUs} with unified address spaces",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "1",
  pages =        "743--758",
  month =        mar,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2654822.2541942",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Sep 4 07:12:13 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The proliferation of heterogeneous compute platforms,
                 of which CPU/GPU is a prevalent example, necessitates a
                 manageable programming model to ensure widespread
                 adoption. A key component of this is a shared unified
                 address space between the heterogeneous units to obtain
                 the programmability benefits of virtual memory. To this
                 end, we are the first to explore GPU Memory Management
                 Units (MMUs) consisting of Translation Lookaside Buffers
                 (TLBs) and page table walkers (PTWs) for address
                 translation in unified heterogeneous systems. We show
                 the performance challenges posed by GPU warp schedulers
                 on TLBs accessed in parallel with L1 caches, which
                 provide many well-known programmability benefits. In
                 response, we propose modest TLB and PTW augmentations
                 that recover most of the performance lost by
                 introducing L1 parallel TLB access. We also show that a
                 little TLB-awareness can make other GPU performance
                 enhancements (e.g., cache-conscious warp scheduling and
                 dynamic warp formation on branch divergence) feasible
                 in the face of cache-parallel address translation,
                 bringing overheads in the range deemed acceptable for
                 CPUs (10--15\% of runtime). We presume this initial
                 design leaves room for improvement but anticipate that
                 our bigger insight, that a little TLB-awareness goes a
                 long way in GPUs, will spur further work in this
                 fruitful area.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS '14 conference proceedings.",
}

@Article{Mondal:2014:DSM,
  author =       "Subijit Mondal and Subhashis Maitra",
  title =        "Data security-modified {AES} algorithm and its
                 applications",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "2",
  pages =        "1--8",
  month =        may,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2669594.2669596",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Sep 15 16:43:20 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Now a days with the rapid development of multimedia
                 technologies, research on safety and security are
                 becoming more important. Multimedia data are generated
                 and transmitted through the communication channels and
                 the wireless media. The efficiencies of encryption
                 based on different existing algorithms are not up to
                 the satisfactory limit. Hence researchers are trying to
                 modify the existing algorithm or even develop new
                 algorithms that help to increase security with a little
                 encryption time. Here in this paper, we have furnished
                 a new technology to modify the AES algorithm which
                 gives more security with a little encryption time and
                 which can be used to encrypt using 128-bit key.
                 Theoretical analysis on the proposed algorithm with the
                 existing reveals the novelty of our work. Here we have
                 proposed a technique to randomize the key and hidden
                 the key data into an encrypted digital image using the
                 basics concept of cryptography and also using the
                 concept of digital watermarking, the concept of
                 key-hide has also been encrypted. We have also proposed
                 a new technique to reposition the pixels to break the
                 correlation between them. So, the proposed scheme
                 offers a more secure and cost effective mechanism for
                 encryption.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Sen:2014:TLT,
  author =       "Soumik Sen and Subhashis Maitra",
  title =        "Three levels three dimensional compact coding",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "2",
  pages =        "9--14",
  month =        may,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2669594.2669597",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Sep 15 16:43:20 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Hardware and timing complexities are the major issues
                 in current security related algorithms. Some of them
                 shows better efficiency with respect to time and some
                 of them reduce hardware complexities. Researchers try
                 to solve both the problem at the same time in an
                 efficient way. There are different existing algorithms
                 which prove this efficiency. Here we will propose a new
                 algorithm named as ``Three Levels Three Dimensional
                 Compact Coding (TLTDCC)'' which will show better
                 response time as well as it requires less hardware and
                 also in security aspect, it will provide higher
                 security. This paper explores a novelty of the work
                 through a comparative study of the proposed algorithm
                 with respect to different existing algorithms both in
                 tabular method and graphically.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thomasian:2014:BDA,
  author =       "Alexander Thomasian and Bingxing Liu and Yuhui Deng",
  title =        "Balancing disk access times in {RAID5} disk arrays in
                 degraded mode by conditionally prioritizing fork\slash
                 join requests",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "2",
  pages =        "15--19",
  month =        may,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2669594.2669598",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Sep 15 16:43:20 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "RAID5 disk arrays with rotated parities can tolerate
                 single disk failures by reconstructing missing blocks
                 on demand by XORing the contents of corresponding $K$
                 blocks on surviving disks by a $K$-way Fork/Join (F/J)
                 request, which is considered completed after the $K$
                 disks are accessed. $ F / J$ accesses in RAID5 are
                 processed concurrently with interfering disk accesses.
                 The mean response time of F/J and
                 independent/interfering requests: $ R^{F / J}_K $
                 and $ R^{\rm Ind}$ and the mean delay from the
                 completion of the first to the last $ F / J$ task,
                 known as task dispersion time: $ T^{\rm disp}_K $, are
                 performance metrics of interest. Given $ R^{F / J}_K
                 > R^{\rm Ind}$ with FCFS scheduling, it is desirable to
                 equalize disk access times, but giving a higher
                 nonpreemptive priority to disk accesses due to $ F / J$
                 requests with respect to interfering disk accesses
                 results in $ R^{\rm Ind} > R^{F / J}_K $. We
                 propose a continuum of conditional priority methods
                 based on the fraction $F$ of $ F / J$ accesses
                 completed with FCFS scheduling. $ F = \infty $ stands
                 for FCFS and $ F = 0$ stands for unconditional
                 priorities. Simulation shows that $ F = 1 / 8$ with $ K
                 = 8$ yields $ R^{F / J}_K \approx R^{\rm Ind}$ for three
                 distributions of disk requests and in the range of $ F
                 / J$ and independent disk requests considered. $F$ can
                 be varied adaptively based on measurement results to
                 balance disk access times.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Gandhi:2014:BTI,
  author =       "Jayneel Gandhi and Arkaprava Basu and Mark D. Hill and
                 Michael M. Swift",
  title =        "{BadgerTrap}: a tool to instrument x86-64 {TLB}
                 misses",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "2",
  pages =        "20--23",
  month =        may,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2669594.2669599",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Sep 15 16:43:20 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The overheads of memory management units (MMUs) have
                 gained importance in today's systems. Detailed
                 simulators may be too slow to gain insights into
                 micro-architectural techniques that improve MMU
                 efficiency. To address this issue, we propose a novel
                 tool, BadgerTrap, which allows online instrumentation
                 of TLB misses. It allows first-order analysis of new
                 hardware techniques to improve MMU efficiency. The tool
                 helps to create and analyze x86-64 TLB miss trace. We
                 describe example studies to show various ways this tool
                 can be applied to gain new research insights.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:2014:INa,
  author =       "Mark Thorson",
  title =        "{Internet} nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "2",
  pages =        "24--36",
  month =        may,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2669594.2669601",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Sep 15 16:43:20 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Towles:2014:UCI,
  author =       "Brian Towles and J. P. Grossman and Brian Greskamp and
                 David E. Shaw",
  title =        "Unifying on-chip and inter-node switching within the
                 {Anton 2} network",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "1--12",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665677",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The design of network architectures has become
                 increasingly complex as the chips connected by
                 inter-node networks have emerged as distributed systems
                 in their own right, complete with their own on-chip
                 networks. In Anton 2, a massively parallel
                 special-purpose supercomputer for molecular dynamics
                 simulations, we managed this complexity by reusing the
                 on-chip network as a switch for inter-node traffic.
                 This unified network approach introduces several design
                 challenges. Maintaining fairness within the inter-node
                 network is difficult, as each hop becomes a sequence of
                 many on-chip routing decisions. We addressed this
                 problem with an inverse-weighted arbiter that ensures
                 fairness with low implementation costs. Balancing the
                 load of inter-node traffic across the on-chip network
                 is also critical, and we adopted an optimization
                 approach to design an appropriate routing algorithm.
                 Finally, the on-chip routers carry inter-node traffic,
                 so they must implement inter-node virtual channels to
                 avoid deadlock. In order to keep the routers small and
                 fast, we developed a deadlock-free routing algorithm
                 that reduces the number of virtual channels by
                 one-third relative to previous approaches. The
                 resulting Anton 2 network implementation efficiently
                 utilizes its inter-node channels and provides low
                 messaging latency, while occupying a modest amount of
                 silicon area.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Putnam:2014:RFA,
  author =       "Andrew Putnam and Adrian M. Caulfield and Eric S.
                 Chung and Derek Chiou and Kypros Constantinides and
                 John Demme and Hadi Esmaeilzadeh and Jeremy Fowers and
                 Gopi Prashanth Gopal and Jan Gray and Michael Haselman
                 and Scott Hauck and Stephen Heil and Amir Hormati and
                 Joo-Young Kim and Sitaram Lanka and James Larus and
                 Eric Peterson and Simon Pope and Aaron Smith and Jason
                 Thong and Phillip Yi Xiao and Doug Burger",
  title =        "A reconfigurable fabric for accelerating large-scale
                 datacenter services",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "13--24",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665678",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Datacenter workloads demand high computational
                 capabilities, flexibility, power efficiency, and low
                 cost. It is challenging to improve all of these factors
                 simultaneously. To advance datacenter capabilities
                 beyond what commodity server designs can provide, we
                 have designed and built a composable, reconfigurable
                 fabric to accelerate portions of large-scale software
                 services. Each instantiation of the fabric consists of
                 a 6x8 2-D torus of high-end Stratix V FPGAs embedded
                 into a half-rack of 48 machines. One FPGA is placed
                 into each server, accessible through PCIe, and wired
                 directly to other FPGAs with pairs of 10 Gb SAS cables.
                 In this paper, we describe a medium-scale deployment of
                 this fabric on a bed of 1,632 servers, and measure its
                 efficacy in accelerating the Bing web search engine. We
                 describe the requirements and architecture of the
                 system, detail the critical engineering challenges and
                 solutions needed to make the system robust in the
                 presence of failures, and measure the performance,
                 power, and resilience of the system when ranking
                 candidate documents. Under high load, the large-scale
                 reconfigurable fabric improves the ranking throughput
                 of each server by a factor of 95\% for a fixed latency
                 distribution --- or, while maintaining equivalent
                 throughput, reduces the tail latency by 29\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Daya:2014:SCR,
  author =       "Bhavya K. Daya and Chia-Hsin Owen Chen and Suvinay
                 Subramanian and Woo-Cheol Kwon and Sunghyun Park and
                 Tushar Krishna and Jim Holt and Anantha P. Chandrakasan
                 and Li-Shiuan Peh",
  title =        "{SCORPIO}: a $ 36$-core research chip demonstrating
                 snoopy coherence on a scalable mesh {NoC} with
                 in-network ordering",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "25--36",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665680",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In the many-core era, scalable coherence and on-chip
                 interconnects are crucial for shared memory processors.
                 While snoopy coherence is common in small multicore
                 systems, directory-based coherence is the de facto
                 choice for scalability to many cores, as snoopy relies
                 on ordered interconnects which do not scale. However,
                 directory-based coherence does not scale beyond tens of
                 cores due to excessive directory area overhead or
                 inaccurate sharer tracking. Prior techniques supporting
                 ordering on arbitrary unordered networks are
                 impractical for full multicore chip designs. We present
                 SCORPIO, an ordered mesh Network-on-Chip (NoC)
                 architecture with a separate fixed-latency, bufferless
                 network to achieve distributed global ordering. Message
                 delivery is decoupled from the ordering, allowing
                 messages to arrive in any order and at any time, and
                 still be correctly ordered. The architecture is
                 designed to plug-and-play with existing multicore IP
                 and with practicality, timing, area, and power as top
                 concerns. Full-system 36 and 64-core simulations on
                 SPLASH-2 and PARSEC benchmarks show an average
                 application runtime reduction of 24.1\% and 12.9\%, in
                 comparison to distributed directory and AMD
                 HyperTransport coherence protocols, respectively. The
                 SCORPIO architecture is incorporated in an 11
                 mm-by-13mm chip prototype, fabricated in IBM 45nm SOI
                 technology, comprising 36 Freescale e200 Power
                 Architecture\TM{} cores with private L1 and L2 caches
                 interfacing with the NoC via ARM AMBA, along with two
                 Cadence on-chip DDR2 controllers. The chip prototype
                 achieves a post synthesis operating frequency of 1 GHz
                 (833MHz post-layout) with an estimated power of 28.8W
                 (768mW per tile), while the network consumes only 10\%
                 of tile area and 19\% of tile power.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Upasani:2014:ACD,
  author =       "Gaurang Upasani and Xavier Vera and Antonio
                 Gonz{\'a}lez",
  title =        "Avoiding core's {DUE \& SDC} via acoustic wave
                 detectors and tailored error containment and recovery",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "37--48",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665682",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The trend of downsizing transistors and operating
                 voltage scaling has made the processor chip more
                 sensitive against radiation phenomena making soft
                 errors an important challenge. New reliability
                 techniques for handling soft errors in the logic and
                 memories that allow meeting the desired
                 failures-in-time (FIT) target are key to keep
                 harnessing the benefits of Moore's law. The failure to
                 scale the soft error rate caused by particle strikes,
                 may soon limit the total number of cores that one may
                 have running at the same time. This paper proposes a
                 light-weight and scalable architecture to eliminate
                 silent data corruption errors (SDC) and detected
                 unrecoverable errors (DUE) of a core. The architecture
                 uses acoustic wave detectors for error detection. We
                 propose to recover by confining the errors in the cache
                 hierarchy, allowing us to deal with the relatively long
                 detection latencies. Our results show that the proposed
                 mechanism protects the whole core (logic, latches and
                 memory arrays) incurring performance overhead as low as
                 0.60\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Chen:2014:MLC,
  author =       "Long Chen and Zhao Zhang",
  title =        "{MemGuard}: a low cost and energy efficient design to
                 support and enhance memory system reliability",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "49--60",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665683",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/hash.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Memory system reliability is increasingly a concern as
                 memory cell density and capacity continue to grow. The
                 conventional approach is to use redundant memory bits
                 for error detection and correction, with significant
                 storage, cost and power overheads. In this paper, we
                 propose a novel, system-level scheme called MemGuard
                 for memory error detection. With OS-based
                 checkpointing, it is also able to recover program
                 execution from memory errors. The memory error
                 detection of MemGuard is motivated by memory integrity
                 verification using log hashes. It is much stronger than
                 SECDED in error detection, incurs negligible hardware
                 cost and energy overhead and no storage overhead, and
                 is compatible with various memory organizations. It may
                 play the role of ECC memory in consumer-level computers
                 and mobile devices, without the shortcomings of ECC
                 memory. In server computers, it may complement SECDED
                 ECC or Chipkill Correct by providing even stronger
                 error detection. We have comprehensively investigated
                 and evaluated the feasibility and reliability of
                 MemGuard. We show that using an incremental multiset
                 hash function and a non-cryptographic hash function,
                 the performance and energy overheads of MemGuard are
                 negligible. We use the mathematical deduction and
                 synthetic simulation to prove that MemGuard is robust
                 and reliable.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Hari:2014:GGE,
  author =       "Siva Kumar Sastry Hari and Radha Venkatagiri and
                 Sarita V. Adve and Helia Naeimi",
  title =        "{GangES}: gang error simulation for hardware
                 resiliency evaluation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "61--72",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665685",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "As technology scales, the hardware reliability
                 challenge affects a broad computing market, rendering
                 traditional redundancy based solutions too expensive.
                 Software anomaly based hardware error detection has
                 emerged as a low cost reliability solution, but suffers
                 from Silent Data Corruptions (SDCs). It is crucial to
                 accurately evaluate SDC rates and identify SDC
                 producing software locations to develop
                 software-centric low-cost hardware resiliency
                 solutions. A recent tool, called Relyzer,
                 systematically analyzes an entire application's
                 resiliency to single bit soft-errors using a small set
                 of carefully selected error injection sites. Relyzer
                 provides a practical resiliency evaluation mechanism
                 but still requires significant evaluation time, most of
                 which is spent on error simulations. This paper
                 presents a new technique called GangES (Gang Error
                 Simulator) that aims to reduce error simulation time.
                 GangES observes that a set or gang of error simulations
                 that result in the same intermediate execution state
                 (after their error injections) will produce the same
                 error outcome; therefore, only one simulation of the
                 gang needs to be completed, resulting in significant
                 overall savings in error simulation time. GangES
                 leverages program structure to carefully select when to
                 compare simulations and what state to compare. For our
                 workloads, GangES saves 57\% of the total error
                 simulation time with an overhead of just 1.6\%. This
                 paper also explores pure program analyses based
                 techniques that could obviate the need for tools such
                 as GangES altogether. The availability of
                 Relyzer+GangES allows us to perform a detailed
                 evaluation of such techniques. We evaluate the accuracy
                 of several previously proposed program metrics. We find
                 that the metrics we considered and their various linear
                 combinations are unable to adequately predict an
                 instruction's vulnerability to SDCs, further motivating
                 the use of Relyzer+GangES style techniques as valuable
                 solutions for the hardware error resiliency evaluation
                 problem.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Wadden:2014:RWD,
  author =       "Jack Wadden and Alexander Lyashevsky and Sudhanva
                 Gurumurthi and Vilas Sridharan and Kevin Skadron",
  title =        "Real-world design and evaluation of compiler-managed
                 {GPU} redundant multithreading",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "73--84",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665686",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Reliability for general purpose processing on the GPU
                 (GPGPU) is becoming a weak link in the construction of
                 reliable supercomputer systems. Because hardware
                 protection is expensive to develop, requires dedicated
                 on-chip resources, and is not portable across different
                 architectures, the efficiency of software solutions
                 such as redundant multithreading (RMT) must be
                 explored. This paper presents a real-world design and
                 evaluation of automatic software RMT on GPU hardware.
                 We first describe a compiler pass that automatically
                 converts GPGPU kernels into redundantly threaded
                 versions. We then perform detailed power and
                 performance evaluations of three RMT algorithms, each
                 of which provides fault coverage to a set of structures
                 in the GPU. Using real hardware, we show that
                 compiler-managed software RMT has highly variable
                 costs. We further analyze the individual costs of
                 redundant work scheduling, redundant computation, and
                 inter-thread communication, showing that no single
                 component in general is responsible for high overheads
                 across all applications; instead, certain workload
                 properties tend to cause RMT to perform well or poorly.
                 Finally, we demonstrate the benefit of architectural
                 support for RMT with a specific example of fast,
                 register-level thread communication.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Chen:2014:ARA,
  author =       "Tianshi Chen and Qi Guo and Ke Tang and Olivier Temam
                 and Zhiwei Xu and Zhi-Hua Zhou and Yunji Chen",
  title =        "{ArchRanker}: a ranking approach to design space
                 exploration",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "85--96",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665688",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Architectural Design Space Exploration (DSE) is a
                 notoriously difficult problem due to the exponentially
                 large size of the design space and long simulation
                 times. Previously, many studies proposed to formulate
                 DSE as a regression problem which predicts architecture
                 responses (e.g., time, power) of a given architectural
                 configuration. Several of these techniques achieve high
                 accuracy, though often at the cost of significant
                 simulation time for training the regression models. We
                 argue that the information the architect mostly needs
                 during the DSE process is whether a given configuration
                 will perform better than another one in the presence
                 of design constraints, or better than any other one
                 seen so far, rather than precisely estimating the
                 performance of that configuration. Based on this
                 observation, we propose a novel ranking-based approach
                 to DSE where we train a model to predict which of two
                 architecture configurations will perform best. We show
                 that, not only this ranking model more accurately
                 predicts the relative merit of two architecture
                 configurations than an ANN-based state-of-the-art
                 regression model, but also that it requires much fewer
                 training simulations to achieve the same accuracy, or
                 that it can be used for and is even better at
                 quantifying the performance gap between two
                 configurations. We implement the framework for training
                 and using this model, called ArchRanker, and we
                 evaluate it on several DSE scenarios (unicore/multicore
                 design spaces, and both time and power performance
                 metrics). We try to emulate as closely as possible the
                 DSE process by creating constraint-based scenarios, or
                 an iterative DSE process. We find that ArchRanker makes
                 29.68\% to 54.43\% fewer incorrect predictions on
                 pairwise relative merit of configurations (tested with
                 79,800 configuration pairs) than an ANN-based
                 regression model across all DSE scenarios considered
                 (values averaged over all benchmarks for each
                 scenario). We also find that, to achieve the same
                 accuracy as ArchRanker, the ANN often requires three
                 times more training simulations.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Shao:2014:APR,
  author =       "Yakun Sophia Shao and Brandon Reagen and Gu-Yeon Wei
                 and David Brooks",
  title =        "{Aladdin}: a {Pre-RTL}, power-performance accelerator
                 simulator enabling large design space exploration of
                 customized architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "97--108",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665689",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Hardware specialization, in the form of accelerators
                 that provide custom datapath and control for specific
                 algorithms and applications, promises impressive
                 performance and energy advantages compared to
                 traditional architectures. Current research in
                 accelerator analysis relies on RTL-based synthesis
                 flows to produce accurate timing, power, and area
                 estimates. Such techniques not only require significant
                 effort and expertise but are also slow and tedious to
                 use, making large design space exploration infeasible.
                 To overcome this problem, we present Aladdin, a
                 pre-RTL, power-performance accelerator modeling
                 framework and demonstrate its application to
                 system-on-chip (SoC) simulation. Aladdin estimates
                 performance, power, and area of accelerators within
                 0.9\%, 4.9\%, and 6.6\% with respect to RTL
                 implementations. Integrated with architecture-level
                 core and memory hierarchy simulators, Aladdin provides
                 researchers an approach to model the power and
                 performance of accelerators in an SoC environment.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Badr:2014:SST,
  author =       "Mario Badr and Natalie Enright Jerger",
  title =        "{SynFull}: synthetic traffic models capturing cache
                 coherent behaviour",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "109--120",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665691",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Modern and future many-core systems represent complex
                 architectures. The communication fabrics of these large
                 systems heavily influence their performance and power
                 consumption. Current simulation methodologies for
                 evaluating networks-on-chip (NoCs) are not keeping pace
                 with the increased complexity of our systems;
                 architects often want to explore many different design
                 knobs quickly. Methodologies that capture workload
                 trends with faster simulation times are highly
                 beneficial at early stages of architectural
                 exploration. We propose SynFull, a synthetic traffic
                 generation methodology that captures both application
                 and cache coherence behaviour to rapidly evaluate NoCs.
                 SynFull allows designers to quickly indulge in detailed
                 performance simulations without the cost of
                 long-running full-system simulation. By capturing a
                 full range of application and coherence behaviour,
                 architects can avoid the over- or under-design of the
                 network as may occur when using traditional synthetic
                 traffic patterns such as uniform random. SynFull has
                 errors as low as 0.3\% and provides 50x speedup on
                 average over full-system simulation.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Venkat:2014:HID,
  author =       "Ashish Venkat and Dean M. Tullsen",
  title =        "Harnessing {ISA} diversity: design of a
                 {heterogeneous-ISA} chip multiprocessor",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "121--132",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665692",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Heterogeneous multicore architectures have the
                 potential for high performance and energy efficiency.
                 These architectures may be composed of small
                 power-efficient cores, large high-performance cores,
                 and/or specialized cores that accelerate the
                 performance of a particular class of computation.
                 Architects have explored multiple dimensions of
                 heterogeneity, both in terms of micro-architecture and
                 specialization. While early work constrained the cores
                 to share a single ISA, this work shows that allowing
                 heterogeneous ISAs further extends the effectiveness of
                 such architectures. This work exploits the diversity
                 offered by three modern ISAs: Thumb, x86-64, and Alpha.
                 This architecture has the potential to outperform the
                 best single-ISA heterogeneous architecture by as much
                 as 21\%, with 23\% energy savings and a reduction of
                 32\% in Energy Delay Product.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Sembrant:2014:DDD,
  author =       "Andreas Sembrant and Erik Hagersten and David
                 Black-Schaffer",
  title =        "The {Direct-to-Data (D2D)} cache: navigating the cache
                 hierarchy with a single lookup",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "133--144",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665694",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Modern processors optimize for cache energy and
                 performance by employing multiple levels of caching
                 that address bandwidth, low-latency and high-capacity.
                 A request typically traverses the cache hierarchy,
                 level by level, until the data is found, thereby
                 wasting time and energy in each level. In this paper,
                 we present the Direct-to-Data (D2D) cache that locates
                 data across the entire cache hierarchy with a single
                 lookup. To navigate the cache hierarchy, D2D extends
                 the TLB with per cache-line location information that
                 indicates in which cache and way the cache line is
                 located. This allows the D2D cache to: (1) skip levels
                 in the hierarchy (by accessing the right cache level
                 directly), (2) eliminate extra data array reads (by
                 reading the right way directly), (3) avoid tag
                 comparisons (by eliminating the tag arrays), and (4) go
                 directly to DRAM on cache misses (by checking the TLB).
                 This reduces the L2 latency by 40\% and saves 5--17\% of
                 the total cache hierarchy energy. D2D's lower L2 latency
                 directly improves L2 sensitive applications'
                 performance by 5--14\%. More significantly, we can take
                 advantage of the L2 latency reduction to optimize other
                 parts of the micro-architecture. For example, we can
                 reduce the ROB size for the L2 bound applications by
                 25\%, or we can reduce the L1 cache size, delivering an
                 overall 21\% energy savings across all benchmarks,
                 without hurting performance.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Arelakis:2014:SSC,
  author =       "Angelos Arelakis and Per Stenstrom",
  title =        "{SC2}: a statistical compression cache scheme",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "145--156",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665696",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Low utilization of on-chip cache capacity limits
                 performance and wastes energy because of the long
                 latency, limited bandwidth, and energy consumption
                 associated with off-chip memory accesses. Value
                 replication is an important source of low capacity
                 utilization. While prior cache compression techniques
                 manage to code frequent values densely, they trade off
                 a high compression ratio for low decompression latency,
                 thus missing opportunities to utilize capacity more
                 effectively. This paper presents, for the first time, a
                 detailed design space exploration of caches that
                 utilize statistical compression. We show that more
                 aggressive approaches like Huffman coding, which have
                 been neglected in the past due to the high processing
                 overhead for (de)compression, are suitable techniques
                 for caches and memory. Based on our key observation
                 that value locality varies little over time and across
                 applications, we first demonstrate that the overhead of
                 statistics acquisition for code generation is low
                 because new encodings are needed rarely, making it
                 possible to off-load it to software routines. We then
                 show that the high compression ratio obtained by
                 Huffman-coding makes it possible to utilize the
                 performance benefits of 4X larger last-level caches
                 with about 50\% lower power consumption than such
                 larger caches.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Seshadri:2014:DBI,
  author =       "Vivek Seshadri and Abhishek Bhowmick and Onur Mutlu
                 and Phillip B. Gibbons and Michael A. Kozuch and Todd
                 C. Mowry",
  title =        "The dirty-block index",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "157--168",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665697",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "On-chip caches maintain multiple pieces of metadata
                 about each cached block --- e.g., dirty bit, coherence
                 information, ECC. Traditionally, such metadata for each
                 block is stored in the corresponding tag entry in the
                 tag store. While this approach is simple to implement
                 and scalable, it necessitates a full tag store lookup
                 for any metadata query --- resulting in high latency
                 and energy consumption. We find that this approach is
                 inefficient and inhibits several cache optimizations.
                 In this work, we propose a new way of organizing the
                 dirty bit information that enables simpler and more
                 efficient implementations of several optimizations. In
                 our proposed approach, we remove the dirty bits from
                 the tag store and organize it differently in a separate
                 structure, which we call the Dirty-Block Index (DBI).
                 The organization of DBI is simple: it consists of
                 multiple entries, each corresponding to some row in
                 DRAM. A bit vector in each entry tracks whether or not
                 each block in the corresponding DRAM row is dirty. We
                 demonstrate the benefits of DBI by using it to
                 simultaneously and efficiently implement three
                 optimizations proposed by prior work: (1) Aggressive
                 DRAM-aware writeback, (2) Bypassing cache lookups, and
                 (3) Heterogeneous ECC for clean/dirty blocks. DBI, with
                 all three optimizations enabled, improves performance
                 by 31\% compared to the baseline (by 6\% compared to
                 the best previous mechanism) while reducing overall
                 cache area cost by 8\% compared to prior approaches.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Liu:2014:GVM,
  author =       "Lei Liu and Yong Li and Zehan Cui and Yungang Bao and
                 Mingyu Chen and Chengyong Wu",
  title =        "Going vertical in memory management: handling
                 multiplicity by multi-policy",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "169--180",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665698",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Many emerging applications from various domains often
                 exhibit heterogeneous memory characteristics. When
                 running in combination on parallel platforms, these
                 applications present a daunting variety of workload
                 behaviors that challenge the effectiveness of any
                 memory allocation strategy. Prior partitioning-based or
                 random memory allocation schemes typically manage only
                 one level of the memory hierarchy and often target
                 specific workloads. To handle diverse and dynamically
                 changing memory and cache allocation needs, we augment
                 existing ``horizontal'' cache/DRAM bank partitioning
                 with vertical partitioning and explore the resulting
                 multi-policy space. We study the performance of these
                 policies for over 2000 workloads and correlate the
                 results with application characteristics via a data
                 mining approach. Based on this correlation we derive
                 several practical memory allocation rules that we
                 integrate into a unified multi-policy framework to
                 guide resources partitioning and coalescing for dynamic
                 and diverse multiprogrammed/threaded workloads. We
                 implement our approach in Linux kernel 2.6.32 as a
                 restructured page indexing system plus a series of
                 kernel modules. Extensive experiments show that, in
                 practice, our framework can select proper memory
                 allocation policy and consistently outperforms the
                 unmodified Linux kernel, achieving up to 11\%
                 performance gains compared to prior techniques.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Orr:2014:FGT,
  author =       "Marc S. Orr and Bradford M. Beckmann and Steven K.
                 Reinhardt and David A. Wood",
  title =        "Fine-grain task aggregation and coordination on
                 {GPUs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "181--192",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665701",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In general-purpose graphics processing unit (GPGPU)
                 computing, data is processed by concurrent threads
                 executing the same function. This model, dubbed
                 single-instruction/multiple-thread (SIMT), requires
                 programmers to coordinate the synchronous execution of
                 similar operations across thousands of data elements.
                 To alleviate this programmer burden, Gaster and Howes
                 outlined the channel abstraction, which facilitates
                 dynamically aggregating asynchronously produced
                 fine-grain work into coarser-grain tasks. However, no
                 practical implementation has been proposed. To this end,
                 we propose and evaluate the first channel
                 implementation. To demonstrate the utility of channels,
                 we present a case study that maps the fine-grain,
                 recursive task spawning in the Cilk programming
                 language to channels by representing it as a flow
                 graph. To support data-parallel recursion in bounded
                 memory, we propose a hardware mechanism that allows
                 wavefronts to yield their execution resources. Through
                 channels and wavefront yield, we implement four Cilk
                 benchmarks. We show that Cilk can scale with the GPU
                 architecture, achieving speedups of as much as 4.3x on
                 eight compute units.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Tanasic:2014:EPM,
  author =       "Ivan Tanasic and Isaac Gelado and Javier Cabezas and
                 Alex Ramirez and Nacho Navarro and Mateo Valero",
  title =        "Enabling preemptive multiprogramming on {GPUs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "193--204",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665702",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "GPUs are being increasingly adopted as compute
                 accelerators in many domains, spanning environments
                 from mobile systems to cloud computing. These systems
                 are usually running multiple applications, from one or
                 several users. However GPUs do not provide the support
                 for resource sharing traditionally expected in these
                 scenarios. Thus, such systems are unable to provide key
                 multiprogrammed workload requirements, such as
                 responsiveness, fairness or quality of service. In this
                 paper, we propose a set of hardware extensions that
                 allow GPUs to efficiently support multiprogrammed GPU
                 workloads. We argue for preemptive multitasking and
                 design two preemption mechanisms that can be used to
                 implement GPU scheduling policies. We extend the
                 architecture to allow concurrent execution of GPU
                 kernels from different user processes and implement a
                 scheduling policy that dynamically distributes the GPU
                 cores among concurrently running kernels, according to
                 their priorities. We extend the NVIDIA GK110 (Kepler)
                 like GPU architecture with our proposals and evaluate
                 them on a set of multiprogrammed workloads with up to
                 eight concurrent processes. Our proposals improve
                 execution time of high-priority processes by 15.6x, the
                 average application turnaround time between 1.5x to 2x,
                 and system fairness up to 3.4x.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Voitsechov:2014:SGM,
  author =       "Dani Voitsechov and Yoav Etsion",
  title =        "Single-graph multiple flows: energy efficient design
                 alternative for {GPGPUs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "205--216",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665703",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "We present the single-graph multiple-flows (SGMF)
                 architecture that combines coarse-grain reconfigurable
                 computing with dynamic dataflow to deliver massive
                 thread-level parallelism. The CUDA-compatible SGMF
                 architecture is positioned as an energy efficient
                 design alternative for GPGPUs. The architecture maps a
                 compute kernel, represented as a dataflow graph, onto a
                 coarse-grain reconfigurable fabric composed of a grid
                 of interconnected functional units. Each unit
                 dynamically schedules instances of the same static
                 instruction originating from different CUDA threads.
                 The dynamically scheduled functional units enable
                 streaming the data of multiple threads (or graph flows,
                 in SGMF parlance) through the grid. The combination of
                 statically mapped instructions and direct communication
                 between functional units obviate the need for a full
                 instruction pipeline and a centralized register file,
                 whose energy overheads burden GPGPUs. We show that the
                 SGMF architecture delivers performance comparable to
                 that of contemporary GPGPUs while consuming 57\% less
                 energy on average.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Campanoni:2014:HRA,
  author =       "Simone Campanoni and Kevin Brownell and Svilen Kanev
                 and Timothy M. Jones and Gu-Yeon Wei and David Brooks",
  title =        "{HELIX--RC}: an architecture-compiler co-design for
                 automatic parallelization of irregular programs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "217--228",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665705",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Data dependences in sequential programs limit
                 parallelization because extracted threads cannot run
                 independently. Although thread-level speculation can
                 avoid the need for precise dependence analysis,
                 communication overheads required to synchronize actual
                 dependences counteract the benefits of parallelization.
                 To address these challenges, we propose a lightweight
                 architectural enhancement co-designed with a
                 parallelizing compiler, which together can decouple
                 communication from thread execution. Simulations of
                 these approaches, applied to a processor with 16 Intel
                 Atom-like cores, show an average of 6.85x performance
                 speedup for six SPEC CINT2000 benchmarks.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Smith:2014:EDN,
  author =       "James E. Smith",
  title =        "Efficient digital neurons for large scale cortical
                 architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "229--240",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665707",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Digital neurons are implemented with the goal of
                 supporting research and development of architectures
                 which implement the computational paradigm of the
                 neocortex. Four spiking digital neurons are implemented
                 at the register transfer level in a manner that permits
                 side-by-side comparisons. Two of the neurons contain
                 two stages of exponential decay, one for synapse
                 conductances and one for membrane potential. The other
                 two neurons contain only one stage of exponential decay
                 for membrane potential. The two stage neurons respond
                 to an input spike with a change in membrane potential
                 that has a non-infinite leading edge slope; the one
                 stage neurons exhibit a change in membrane potential
                 with an abrupt, infinite leading edge slope. This leads
                 to a behavioral difference when a number of input
                 spikes occur in very close time proximity. However, the
                 one stage neurons are as much as a factor of ten more
                 energy efficient than the two stage neurons, as
                 measured by the number of dynamic add-equivalent
                 operations. A new two stage neuron is proposed. This
                 neuron reduces the number of decay components and
                 implements decays in both stages via piece-wise linear
                 approximation. Together, these simplifications yield
                 two stage neuron behavior with energy efficiency that
                 is only about a factor of two worse than the simplest
                 one stage neuron.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Swaminathan:2014:EAS,
  author =       "Karthik Swaminathan and Huichu Liu and Jack Sampson
                 and Vijaykrishnan Narayanan",
  title =        "An examination of the architecture and system-level
                 tradeoffs of employing steep slope devices in {$3$D}
                 {CMPs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "241--252",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665709",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "For any given application, there is an optimal
                 throughput point in the space of per-processor
                 performance and the number of such processors given to
                 that application. However, due to thermal, yield, and
                 other constraints, not all of these optimal points can
                 plausibly be constructed with a given technology. In
                 this paper, we look at how emerging steep slope
                 devices, 3D circuit integration, and trends in process
                 technology scaling will combine to shift the boundaries
                 of both attainable performance, and the optimal set of
                 technologies to employ to achieve it. We propose a
                 heterogeneous-technology 3D architecture capable of
                 operating efficiently at an expanded number of points
                 in this larger design space and devise a heterogeneity
                 and thermal aware scheduling algorithm to exploit its
                 potential. Our heterogeneous mapping techniques are
                 capable of producing speedups ranging from 17\% for a
                 high end server workloads running at around 90${}^\circ
                 $C to over 160\% for embedded systems running below
                 60${}^\circ $C.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Venkatesan:2014:SST,
  author =       "Rangharajan Venkatesan and Shankar Ganesh
                 Ramasubramanian and Swagath Venkataramani and Kaushik
                 Roy and Anand Raghunathan",
  title =        "{STAG}: spintronic-tape architecture for {GPGPU} cache
                 hierarchies",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "253--264",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665710",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "General-purpose Graphics Processing Units (GPGPUs) are
                 widely used for executing massively parallel workloads
                 from various application domains. Feeding data to the
                 hundreds to thousands of cores that current GPGPUs
                 integrate places great demands on the memory hierarchy,
                 fueling an ever-increasing demand for on-chip memory.
                 In this work, we propose STAG, a high density,
                 energy-efficient GPGPU cache hierarchy design using a
                 new spintronic memory technology called Domain Wall
                 Memory (DWM). DWMs inherently offer unprecedented
                 benefits in density by storing multiple bits in the
                 domains of a ferromagnetic nanowire, which logically
                 resembles a bit-serial tape. However, this structure
                 also leads to a unique challenge that the bits must be
                 sequentially accessed by performing ``shift''
                 operations, resulting in variable and potentially
                 higher access latencies. To address this challenge,
                 STAG utilizes a number of architectural techniques:
                 (i) a hybrid cache organization that employs different
                 DWM bit-cells to realize the different memory arrays
                 within the GPGPU cache hierarchy, (ii) a clustered,
                 bit-interleaved organization, in which the bits in a
                 cache block are spread across a cluster of DWM tapes,
                 allowing parallel access, (iii) tape head management
                 policies that predictively configure DWM arrays to
                 reduce the expected number of shift operations for
                 subsequent accesses, and (iv) a shift aware promotion
                 buffer (SaPB), in which accesses to the DWM cache are
                 predicted based on intra-warp locality, and locations
                 that would incur a large shift penalty are promoted to
                 a smaller buffer. Over a wide range of benchmarks from
                 the Rodinia, ISPASS and Parboil suites, STAG achieves
                 significant benefits in performance (12.1\% over SRAM
                 and 5.8\% over STT-MRAM) and energy (3.3X over SRAM and
                 2.6X over STT-MRAM).",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Pelley:2014:MP,
  author =       "Steven Pelley and Peter M. Chen and Thomas F.
                 Wenisch",
  title =        "Memory persistency",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "265--276",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665712",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Emerging nonvolatile memory technologies (NVRAM)
                 promise the performance of DRAM with the persistence of
                 disk. However, constraining NVRAM write order,
                 necessary to ensure recovery correctness, limits NVRAM
                 write concurrency and degrades throughput. We require
                 new memory interfaces to minimally describe write
                 constraints and allow high performance and high
                 concurrency data structures. These goals strongly
                 resemble memory consistency. Whereas memory consistency
                 concerns the order that memory operations are observed
                 between numerous processors, persistent memory systems
                 must constrain the order that writes occur with respect
                 to failure. We introduce memory persistency, a new
                 approach to designing persistent memory interfaces,
                 building on memory consistency. Similar to memory
                 consistency, memory persistency models may be relaxed
                 to improve performance. We describe the design space of
                 memory persistency and desirable features that such a
                 memory system requires. Finally, we introduce several
                 memory persistency models and evaluate their ability to
                 expose NVRAM write concurrency using two
                 implementations of a persistent queue. Our results show
                 that relaxed persistency models accelerate system
                 throughput 30-fold by reducing NVRAM write
                 constraints.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Hoseinzadeh:2014:RAL,
  author =       "Morteza Hoseinzadeh and Mohammad Arjomand and Hamid
                 Sarbazi-Azad",
  title =        "Reducing access latency of {MLC PCMs} through line
                 striping",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "277--288",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665713",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Although phase change memory with multi-bit storage
                 capability (known as MLC PCM) offers a good combination
                 of high bit-density and non-volatility, its performance
                 is severely impacted by the increased read/write
                 latency. Regarding read operation, access latency
                 increases almost linearly with respect to cell density
                 (the number of bits stored in a cell). Since reads are
                 latency critical, they can seriously impact system
                 performance. This paper alleviates the problem of slow
                 reads in the MLC PCM by exploiting a fundamental
                 property of MLC devices: the Most-Significant Bit (MSB)
                 of MLC cells can be read as fast as SLC cells, while
                 reading the Least-Significant Bits (LSBs) is slower. We
                 propose Striped PCM (SPCM), a memory architecture that
                 leverages this property to keep MLC read latency in the
                 order of SLC's. In order to avoid extra writes onto
                 memory cells as a result of striping memory lines, the
                 proposed design uses a pairing write queue to
                 synchronize write-back requests associated with blocks
                 that are paired in striping mode. Our evaluation shows
                 that our design significantly improves the average
                 memory access latency by more than 30\% and IPC by up
                 to 25\% (10\%, on average), with a slight overhead in
                 memory energy (0.7\%) in a 4-core CMP model running
                 memory-intensive benchmarks",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Jung:2014:HHI,
  author =       "Myoungsoo Jung and Wonil Choi and Shekhar Srikantaiah
                 and Joonhyuk Yoo and Mahmut T. Kandemir",
  title =        "{HIOS}: a host interface {I/O} scheduler for solid
                 state disks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "289--300",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665715",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Garbage collection (GC) and resource contention on I/O
                 buses (channels) are among the critical bottlenecks in
                 Solid State Disks (SSDs) that cannot be easily hidden.
                 Most existing I/O scheduling algorithms in the host
                 interface logic (HIL) of state-of-the-art SSDs are
                 oblivious to such low-level performance bottlenecks in
                 SSDs. As a result, SSDs may violate quality of service
                 (QoS) requirements by not being able to meet the
                 deadlines of I/O requests. In this paper, we propose a
                 novel host interface I/O scheduler that is both
                 GC-aware and QoS-aware. The proposed scheduler
                 redistributes the GC overheads across non-critical I/O
                 requests and reduces channel resource contention. Our
                 experiments with workloads from various application
                 domains reveal that the proposed scheduler reduces the
                 standard deviation for latency over state-of-the-art
                 I/O schedulers used in the HIL by 52.5\%, and the
                 worst-case latency by 86.6\%. In addition, for I/O
                 requests with sizes smaller than a superpage, our
                 proposed scheduler avoids channel resource conflicts
                 and reduces latency by 29.2\% compared to the
                 state-of-the-art",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Lo:2014:TEP,
  author =       "David Lo and Liqun Cheng and Rama Govindaraju and Luiz
                 Andr{\'e} Barroso and Christos Kozyrakis",
  title =        "Towards energy proportionality for large-scale
                 latency-critical workloads",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "301--312",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665718",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Reducing the energy footprint of warehouse-scale
                 computer (WSC) systems is key to their affordability,
                 yet difficult to achieve in practice. The lack of
                 energy proportionality of typical WSC hardware and the
                 fact that important workloads (such as search) require
                 all servers to remain up regardless of traffic
                 intensity renders existing power management techniques
                 ineffective at reducing WSC energy use. We present
                 PEGASUS, a feedback-based controller that significantly
                 improves the energy proportionality of WSC systems, as
                 demonstrated by a real implementation in a Google
                 search cluster. PEGASUS uses request latency statistics
                 to dynamically adjust server power management limits in
                 a fine-grain manner, running each server just fast
                 enough to meet global service-level latency objectives.
                 In large cluster experiments, PEGASUS reduces power
                 consumption by up to 20\%. We also estimate that a
                 distributed version of PEGASUS can nearly double these
                 savings",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Liu:2014:SRJ,
  author =       "Yanpei Liu and Stark C. Draper and Nam Sung Kim",
  title =        "{SleepScale}: runtime joint speed scaling and sleep
                 states management for power efficient data centers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "313--324",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665719",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Power consumption in data centers has been growing
                 significantly in recent years. To reduce power, servers
                 are being equipped with increasingly sophisticated
                 power management mechanisms. Different mechanisms offer
                 dramatically different trade-offs between power savings
                 and performance penalties. Considering the complexity,
                 variety, and temporally varying nature of the
                 applications hosted in a typical data center,
                 intelligently determining which power management policy
                 to use and when is a complicated task. In this paper we
                 analyze a system model featuring both performance
                 scaling and low-power states. We reveal the interplay
                 between performance scaling and low-power states via
                 intensive simulation and analytic verification. Based
                 on the observations, we present SleepScale, a runtime
                 power management tool designed to efficiently exploit
                 existing power control mechanisms. At run time,
                 SleepScale characterizes power consumption and
                 quality-of-service (QoS) for each low-power state and
                 frequency setting, and selects the best policy for a
                 given QoS constraint. We evaluate SleepScale using
                 workload traces from data centers and achieve
                 significant power savings relative to conventional
                 power management strategies",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Liu:2014:OVM,
  author =       "Ming Liu and Tao Li",
  title =        "Optimizing virtual machine consolidation performance
                 on {NUMA} server architecture for cloud workloads",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "325--336",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665720",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "Server virtualization and workload consolidation
                 enable multiple workloads to share a single physical
                 server, resulting in significant energy savings and
                 utilization improvements. The shift of physical server
                 architectures to NUMA and the increasing popularity of
                 scale-out cloud applications undermine workload
                 consolidation efficiency and result in overall system
                 degradation. In this work, we characterize the
                 consolidation of cloud workloads on NUMA virtualized
                 systems, estimate four different sources of
                 architecture overhead, and explore optimization
                 opportunities beyond the default NUMA-aware hypervisor
                 memory management. Motivated by the observed
                 architectural impact on cloud workload consolidation
                 performance, we propose three optimization techniques
                 incorporating NUMA access overhead into the
                 hypervisor's virtual machine memory allocation and page
                 fault handling routines. Among these, estimation of the
                 memory zone access overhead serves as a foundation for
                 the other two techniques: a NUMA overhead aware buddy
                 allocator and a P2M swap FIFO. Cache hit rate, cycle
                 loss due to cache miss, and IPC serve as indicators to
                 estimate the access cost of each memory node. Our
                 optimized buddy allocator dynamically selects
                 low-overhead memory zones and ``proportionally''
                 distributes memory pages across target nodes. The P2M
                 swap FIFO records recently unused PFN, MFN lists for
                 mapping exchanges to rebalance memory access pressure
                 within one domain. Our real system based evaluations
                 show a 41.1\% performance improvement when
                 consolidating 16-VMs on a 4-socket server (the proposed
                 allocator contributes 22.8\% of the performance gain
                 and the P2M swap FIFO accounts for the rest).
                 Furthermore, our techniques can cooperate well with
                 other methods (i.e. vCPU migration) and scale well when
                 varying VM memory size and the number of sockets in a
                 physical host",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{O:2014:RBD,
  author =       "Seongil O and Young Hoon Son and Nam Sung Kim and Jung
                 Ho Ahn",
  title =        "Row-buffer decoupling: a case for low-latency {DRAM}
                 microarchitecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "337--348",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665723",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Modern DRAM devices for the main memory are structured
                 to have multiple banks to satisfy ever-increasing
                 throughput, energy-efficiency, and capacity demands.
                 Due to tight cost constraints, only one row can be
                 buffered (opened) per bank and actively service
                 requests at a time, while the row must be deactivated
                 (closed) before a new row is stored into the row
                 buffers. Hasty deactivation unnecessarily re-opens rows
                 for otherwise row-buffer hits while hindsight
                 accompanies the deactivation process on the critical
                 path of accessing data for row-buffer misses. The time
                 to (de)activate a row is comparable to the time to read
                 an open row while applications are often sensitive to
                 DRAM latency. Hence, it is critical to make the right
                 decision on when to close a row. However, the
                 increasing number of banks per DRAM device over
                 generations reduces the number of requests per bank.
                 This forces a memory controller to frequently predict
                 when to close a row due to a lack of information on
                 future requests, while the dynamic nature of memory
                 access patterns limits the prediction accuracy. In this
                 paper, we propose a novel DRAM microarchitecture that
                 can eliminate the need for any prediction. First, we
                 identify that precharging the bitlines dominates the
                 deactivate time, while sense amplifiers that work as a
                 row buffer are physically coupled with the bitlines
                 such that a single command precharges both bitlines and
                 sense amplifiers simultaneously. By decoupling the
                 bitlines from the row buffers using isolation
                 transistors, the bitlines can be precharged right after
                 a row becomes activated. Therefore, only the sense
                 amplifiers need to be precharged for a miss in most
                 cases, taking an order of magnitude shorter time than
                 the conventional deactivation process. Second, we show
                 that this row-buffer decoupling enables internal DRAM
                 $\mu$-operations to be separated and recombined, which can
                 be exploited by memory controllers to make the main
                 memory system more energy efficient. Our experiments
                 demonstrate that row-buffer decoupling improves the
                 geometric mean of the instructions per cycle and
                 MIPS$^2$/W by 14\% and 29\%, respectively, for
                 memory-intensive SPEC CPU2006 applications",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Zhang:2014:HDH,
  author =       "Tao Zhang and Ke Chen and Cong Xu and Guangyu Sun and
                 Tao Wang and Yuan Xie",
  title =        "{Half-DRAM}: a high-bandwidth and low-power {DRAM}
                 architecture from the rethinking of fine-grained
                 activation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "349--360",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665724",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "DRAM memory is a major contributor for the total power
                 consumption in modern computing systems. Consequently,
                 power reduction for DRAM memory is critical to improve
                 system-level power efficiency. Fine-grained DRAM
                 architecture [1, 2] has been proposed to reduce the
                 activation/precharge power. However, those prior work
                 either incurs significant performance degradation or
                 introduces large area overhead. In this paper, we
                 propose a novel memory architecture Half-DRAM, in which
                 the DRAM array is reorganized to enable only half of a
                 row being activated. The half-row activation can
                 effectively reduce activation power and meanwhile
                 sustain the full bandwidth one bank can provide. In
                 addition, the half-row activation in Half-DRAM relaxes
                 the power constraint in DRAM, and opens up
                 opportunities for further performance gain.
                 Furthermore, two half-row accesses can be issued in
                 parallel by integrating the sub-array level parallelism
                 to improve the memory level parallelism. The
                 experimental results show that Half-DRAM can achieve
                 both significant performance improvement and power
                 reduction, with negligible design overhead",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Kim:2014:FBM,
  author =       "Yoongu Kim and Ross Daly and Jeremie Kim and Chris
                 Fallin and Ji Hye Lee and Donghyuk Lee and Chris
                 Wilkerson and Konrad Lai and Onur Mutlu",
  title =        "Flipping bits in memory without accessing them: an
                 experimental study of {DRAM} disturbance errors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "361--372",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665726",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Memory isolation is a key property of a reliable and
                 secure computing system --- an access to one memory
                 address should not have unintended side effects on data
                 stored in other addresses. However, as DRAM process
                 technology scales down to smaller dimensions, it
                 becomes more difficult to prevent DRAM cells from
                 electrically interacting with each other. In this
                 paper, we expose the vulnerability of commodity DRAM
                 chips to disturbance errors. By reading from the same
                 address in DRAM, we show that it is possible to corrupt
                 data in nearby addresses. More specifically, activating
                 the same row in DRAM corrupts data in nearby rows. We
                 demonstrate this phenomenon on Intel and AMD systems
                 using a malicious program that generates many DRAM
                 accesses. We induce errors in most DRAM modules (110
                 out of 129) from three major DRAM manufacturers. From
                 this we conclude that many deployed systems are likely
                 to be at risk. We identify the root cause of
                 disturbance errors as the repeated toggling of a DRAM
                 row's wordline, which stresses inter-cell coupling
                 effects that accelerate charge leakage from nearby
                 rows. We provide an extensive characterization study of
                 disturbance errors and their behavior using an
                 FPGA-based testing platform. Among our key findings, we
                 show that (i) it takes as few as 139K accesses to
                 induce an error and (ii) up to one in every 1.7K cells
                 is susceptible to errors. After examining various
                 potential ways of addressing the problem, we propose a
                 low-overhead solution to prevent the errors",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Zhang:2014:AIP,
  author =       "Runjie Zhang and Ke Wang and Brett H. Meyer and Mircea
                 R. Stan and Kevin Skadron",
  title =        "Architecture implications of pads as a scarce
                 resource",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "373--384",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665728",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Due to non-ideal technology scaling, delivering a
                 stable supply voltage is increasingly challenging.
                 Furthermore, competition for limited chip interface
                 resources (i.e., C4 pads) between power supply and I/O,
                 and the loss of such resources to electromigration,
                 means that constructing a power delivery network (PDN)
                 that satisfies noise margins without compromising
                 performance is and will remain a critical problem for
                 architects and circuit designers alike. Simple
                 guardbanding will no longer work, as the consequent
                 performance penalty will grow with technology scaling.
                 In this paper, we develop a pre-RTL PDN model,
                 VoltSpot, for the purpose of studying the performance
                 and noise tradeoffs among power supply and I/O pad
                 allocation, the effectiveness of noise mitigation
                 techniques, and the consequent implications of
                 electromigration-induced PDN pad failure. Our
                 simulations demonstrate that, despite their integral
                 role in the PDN, power/ground pads can be aggressively
                 reduced (by conversion into I/O pads) to their
                 electromigration limit with minimal performance impact
                 from extra voltage noise --- provided the system
                 implements a suitable noise-mitigation strategy. The
                 key observation is that even though reducing
                 power/ground pads significantly increases the number of
                 voltage emergencies, the average noise amplitude
                 increase is small. Overall, we can triple I/O bandwidth
                 while maintaining target lifetimes and incurring only
                 1.5\% slowdown",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Chen:2014:ICB,
  author =       "Shaoming Chen and Yue Hu and Ying Zhang and Lu Peng
                 and Jesse Ardonne and Samuel Irving and Ashok
                 Srivastava",
  title =        "Increasing off-chip bandwidth in multi-core processors
                 with switchable pins",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "385--396",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665730",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Off-chip memory bandwidth has been considered as one
                 of the major limiting factors to processor performance,
                 especially for multi-cores and many-cores. Conventional
                 processor design allocates a large portion of off-chip
                 pins to deliver power, leaving a small number of pins
                 for processor signal communication. We observed that
                 the processor requires much less power than that can be
                 supplied during memory intensive stages. This is due to
                 the fact that the frequencies of processor cores
                 waiting for data to be fetched from off-chip memories
                 can be scaled down in order to save power without
                 degrading performance. In this work, motivated by this
                 observation, we propose a dynamic pin switch technique
                 to alleviate the bandwidth limitation issue. The
                 technique is introduced to dynamically exploit the
                 surplus pins for power delivery in the memory intensive
                 phases and uses them to provide extra bandwidth for the
                 program executions, thus significantly boosting the
                 performance",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Jiang:2014:LPR,
  author =       "Lei Jiang and Bo Zhao and Jun Yang and Youtao Zhang",
  title =        "A low power and reliable charge pump design for phase
                 change memories",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "397--408",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665731",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The emerging Phase Change Memory (PCM) technology
                 exhibits excellent scalability and density potentials.
                 At the same time, they require high current and high
                 voltages to switch cell states. Their working voltages
                 are provided by CMOS-compatible on-chip charge pumps
                 (CPs). Unfortunately, CPs and particularly those for
                 RESET, have a large parasitic power (a dominant
                 component in total power loss) during operations, which
                 significantly degrades their energy efficiency. In
                 addition, CPs seriously suffer from the Time-Dependent
                 Dielectric Breakdown (TDDB) problem due to their
                 boosted operation voltage. To maintain a reasonable
                 lifetime of CPs, existing solutions actively switch
                 them on per-operation basis, resulting in large
                 performance degradation. In this paper, we address the
                 above issues through two designs --- Reset\_Sch (RESET
                 scheduling) and CP\_Sch (CP scheduling). Reset\_Sch
                 schedules when to perform a RESET for different cells
                 upon writing a PCM line. It significantly reduces the
                 power loss, and peak working power of RESET CP. CP\_Sch
                 incorporates a fast READ CP design to provide fast
                 charge-up time for reads and minimize performance
                 penalty. Our experimental results show that on average,
                 70\% of power loss for RESET CP can be reduced; and
                 performance loss can be reduced from 16\% to 2\% while
                 achieving a 16\% improvement in reliability",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Voskuilen:2014:FCP,
  author =       "Gwendolyn Voskuilen and T. N. Vijaykumar",
  title =        "{Fractal++}: closing the performance gap between
                 fractal and conventional coherence",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "409--420",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665733",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Cache coherence protocol bugs can cause multicores to
                 fail. Existing coherence verification approaches incur
                 state explosion at small scales or require considerable
                 human effort. As protocols' complexity and multicores'
                 core counts increase, verification continues to be a
                 challenge. Recently, researchers proposed fractal
                 coherence which achieves scalable verification by
                 enforcing observational equivalence between sub-systems
                 in the coherence protocol. A larger subsystem is
                 verified implicitly if a smaller sub-system has been
                 verified. Unfortunately, fractal protocols suffer from
                 two fundamental limitations: (1)
                 indirect-communication: sub-systems cannot directly
                 communicate and (2) partially-serial invalidations:
                 cores must be invalidated in a specific, serial order.
                 These limitations disallow common performance
                 optimizations used by conventional directory protocols:
                 reply forwarding where caches communicate directly and
                 parallel invalidations. Therefore, fractal protocols
                 lack performance scalability while directory protocols
                 lack verification scalability. To enable both
                 performance and verification scalability, we propose
                 Fractal++ which employs a new class of protocol
                 optimizations for verification-constrained
                 architectures: decoupled-replies, contention-hints, and
                 fully-parallel-fractal-invalidations. The first two
                 optimizations allow reply-forwarding-like performance
                 while the third optimization enables parallel
                 invalidations in fractal protocols. Unlike conventional
                 protocols, Fractal++ preserves observational
                 equivalence and hence is scalably verifiable. In
                 32-core simulations of single- and four-socket systems,
                 Fractal++ performs nearly as well as a directory
                 protocol while providing scalable verifiability whereas
                 the best-performing previous fractal protocol performs
                 8\% on average and up to 26\% worse with a
                 single-socket and 12\% on average and up to 34\% worse
                 with a longer-latency multi-socket system",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Qian:2014:ODB,
  author = {Xuehai Qian and
    Benjamin Sahelices and
    Josep Torrellas},
  title = {{OmniOrder}: directory-based conflict serialization of transactions},
  journal = j-COMP-ARCH-NEWS,
  fjournal = {ACM SIGARCH Computer Architecture News},
  volume = {42},
  number = {3},
  pages = {421--432},
  month = jun,
  year = {2014},
  DOI = {https://doi.org/10.1145/2678373.2665734},
  CODEN = {CANED2},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  bibdate = {Wed Dec 3 16:18:50 MST 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark = {ISCA '14 conference proceedings.},
  abstract = {Effective execution of atomic blocks of instructions (also called
    transactions) can enhance the performance and programmability of
    multiprocessors. Atomic blocks can be demarcated in software as in
    Transactional Memory (TM) or dynamically generated by the hardware as in
    aggressive implementations of strict memory consistency. In most current
    designs, when two atomic blocks conflict, one is squashed --- a
    performance loss that is often unnecessary. To avoid this waste, this
    paper presents OmniOrder, the first design that efficiently executes
    conflicting atomic blocks concurrently in a directory-based coherence
    environment. The idea is to keep only non-speculative data in the caches
    and, when the cache coherence protocol transfers a line, include in the
    message the history of speculative updates to the line. The coherence
    protocol transitions are unmodified. We evaluate OmniOrder with 64-core
    simulations. In a TM environment, OmniOrder reduces the execution time of
    the STAMP applications by an average of 18.4\% over a scheme that
    squashes on conflict. In an environment with SC enforcement with
    speculation, we run 11 programs that implement concurrent algorithms.
    OmniOrder reduces the programs' execution time by an average of 15.3\%
    relative to a scheme that squashes on conflict. Finally, OmniOrder's
    communication overhead of transferring the history of speculative updates
    is negligible},
}

@Article{Qian:2014:PRR,
  author =       "Xuehai Qian and Benjamin Sahelices and Depei Qian",
  title =        "{Pacifier}: record and replay for relaxed-consistency
                 multiprocessors with distributed directory protocol",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "433--444",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665736",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Record and Deterministic Replay (R\&R) of
                 multithreaded programs on relaxed-consistency
                 multiprocessors with distributed directory protocol has
                 been a long-standing open problem. The independently
                 developed RelaxReplay [8] solves the problem by
                 assuming write atomicity. This paper proposes Pacifier,
                 the first R\&R scheme to provide a solution without
                 assuming write atomicity. R\&R for relaxed-consistency
                 multiprocessors needs to detect, record and replay
                 Sequential Consistency Violations (SCV). Pacifier has
                 two key components: (i) Relog, a general memory
                 reordering logging and replay mechanism that can
                 reproduce SCVs in relaxed memory models, and (ii)
                 Granule, an SCV detection scheme in the record phase
                 with good precision, that indicates whether to record
                 with Relog. We show that Pacifier is a sweet spot in
                 the design space with a reasonable trade-off between
                 hardware and log overhead. An evaluation with
                 simulations of 16, 32 and 64 processors with Release
                 Consistency (RC) running SPLASH-2 applications
                 indicates that Pacifier incurs 3.9\% $\sim$ 16\% larger
                 logs. The slowdown of Pacifier during replay is 10.1\%
                 $\sim$ 30.5\% compared to native execution",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Honarmand:2014:RDL,
  author =       "Nima Honarmand and Josep Torrellas",
  title =        "Replay debugging: leveraging record and replay for
                 program debugging",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "445--456",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665737",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Hardware-assisted Record and Deterministic Replay
                 (RnR) of programs has been proposed as a primitive for
                 debugging hard-to-repeat software bugs. However, simply
                 providing support for repeatedly stumbling on the same
                 bug does not help diagnose it. For bug diagnosis,
                 developers typically want to modify the code, e.g., by
                 creating and operating on new variables, or printing
                 state. Unfortunately, this renders the RnR log
                 inconsistent and makes Replay Debugging (i.e.,
                 debugging while using an RnR log for replay) dicey at
                 best. This paper presents rdb, the first scheme for
                 replay debugging that guarantees exact replay. rdb
                 relies on two mechanisms. The first one is compiler
                 support to split the instrumented application into two
                 executables: one that is identical to the original
                 program binary, and another that encapsulates all the
                 added debug code. The second mechanism is a runtime
                 infrastructure that replays the application and,
                 without affecting it in any way, invokes the
                 appropriate debug code at the appropriate locations. We
                 describe an implementation of rdb based on LLVM and
                 Pin, and show an example of how rdb's replay debugging
                 helps diagnose a real bug",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Woodruff:2014:CCM,
  author = {Jonathan Woodruff and
    Robert N. M. Watson and
    David Chisnall and
    Simon W. Moore and
    Jonathan Anderson and
    Brooks Davis and
    Ben Laurie and
    Peter G. Neumann and
    Robert Norton and
    Michael Roe},
  title = {The {CHERI} capability model: revisiting {RISC} in an age of risk},
  journal = j-COMP-ARCH-NEWS,
  fjournal = {ACM SIGARCH Computer Architecture News},
  volume = {42},
  number = {3},
  pages = {457--468},
  month = jun,
  year = {2014},
  DOI = {https://doi.org/10.1145/2678373.2665740},
  CODEN = {CANED2},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  bibdate = {Wed Dec 3 16:18:50 MST 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark = {ISCA '14 conference proceedings.},
  abstract = {Motivated by contemporary security challenges, we reevaluate and
    refine capability-based addressing for the RISC era. We present CHERI, a
    hybrid capability model that extends the 64-bit MIPS ISA with
    byte-granularity memory protection. We demonstrate that CHERI enables
    language memory model enforcement and fault isolation in hardware rather
    than software, and that the CHERI mechanisms are easily adopted by
    existing programs for efficient in-program memory safety. In contrast to
    past capability models, CHERI complements, rather than replaces, the
    ubiquitous page-based protection mechanism, providing a migration path
    towards deconflating data-structure protection and OS memory management.
    Furthermore, CHERI adheres to a strict RISC philosophy: it maintains a
    load-store architecture and requires only single-cycle instructions, and
    supplies protection primitives to the compiler, language runtime, and
    operating system. We demonstrate a mature FPGA implementation that runs
    the FreeBSD operating system with a full range of software and an
    open-source application suite compiled with an extended LLVM to use CHERI
    memory protection. A limit study compares published memory safety
    mechanisms in terms of instruction count and memory overheads. The study
    illustrates that CHERI is performance-competitive even while providing
    assurance and greater flexibility with simpler hardware},
}

@Article{Vilanova:2014:CPS,
  author =       "Llu{\'\i}s Vilanova and Muli Ben-Yehuda and Nacho
                 Navarro and Yoav Etsion and Mateo Valero",
  title =        "{CODOMs}: protecting software with code-centric memory
                 domains",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "469--480",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665741",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Today's complex software systems are neither secure
                 nor reliable. The rudimentary software protection
                 primitives provided by current hardware forces systems
                 to run many distrusting software components (e.g.,
                 procedures, libraries, plugins, modules) in the same
                 protection domain, or otherwise suffer degraded
                 performance from address space switches. We present
                 CODOMs (COde-centric memory DOMains), a novel
                 architecture that can provide finer-grained isolation
                 between software components with effectively zero
                 run-time overhead, all at a fraction of the complexity
                 of other approaches. An implementation of CODOMs in a
                 cycle-accurate full-system x86 simulator demonstrates
                 that with the right hardware support, finer-grained
                 protection and run-time performance can peacefully
                 coexist.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Perais:2014:EPW,
  author =       "Arthur Perais and Andr{\'e} Seznec",
  title =        "{EOLE}: paving the way for an effective implementation
                 of value prediction",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "481--492",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665742",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Even in the multicore era, there is a continuous
                 demand to increase the performance of single-threaded
                 applications. However, the conventional path of
                 increasing both issue width and instruction window size
                 inevitably leads to the power wall. Value prediction
                 (VP) was proposed in the mid 90's as an alternative
                 path to further enhance the performance of wide-issue
                 superscalar processors. Still, it was considered up to
                 recently that a performance-effective implementation of
                 Value Prediction would add tremendous complexity and
                 power consumption in almost every stage of the pipeline.
                 Nonetheless, recent work in the field of VP has shown
                 that given an efficient confidence estimation
                 mechanism, prediction validation could be removed from
                 the out-of-order engine and delayed until commit time.
                 As a result, recovering from mispredictions via
                 selective replay can be avoided and a much simpler
                 mechanism --- pipeline squashing --- can be used, while
                 the out-of-order engine remains mostly unmodified. Yet,
                 VP and validation at commit time entails strong
                 constraints on the Physical Register File. Write ports
                 are needed to write predicted results and read ports
                 are needed in order to validate them at commit time,
                 potentially rendering the overall number of ports
                 unbearable. Fortunately, VP also implies that many
                 single-cycle ALU instructions have their operands
                 predicted in the front-end and can be executed
                 in-place, in-order. Similarly, the execution of
                 single-cycle instructions whose result has been
                 predicted can be delayed until commit time since
                 predictions are validated at commit time. Consequently,
                 a significant number of instructions --- 10\% to 60\%
                 in our experiments --- can bypass the out-of-order
                 engine, allowing the reduction of the issue width,
                 which is a major contributor to both out-of-order
                 engine complexity and register file port requirement.
                 This reduction paves the way for a truly practical
                 implementation of Value Prediction. Furthermore, since
                 Value Prediction in itself usually increases
                 performance, our resulting {Early | Out-of-Order |
                 Late} Execution architecture, EOLE, is often more
                 efficient than a baseline VP-augmented 6-issue
                 superscalar while having a significantly narrower
                 4-issue out-of-order engine",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Czechowski:2014:IEE,
  author =       "Kenneth Czechowski and Victor W. Lee and Ed Grochowski
                 and Ronny Ronen and Ronak Singhal and Richard Vuduc and
                 Pradeep Dubey",
  title =        "Improving the energy efficiency of big cores",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "493--504",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665743",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Traditionally, architectural innovations designed to
                 boost single-threaded performance incur overhead costs
                 which significantly increase power consumption. In many
                 cases the increase in power exceeds the improvement in
                 performance, resulting in a net increase in energy
                 consumption. Thus, it is reasonable to assume that
                 modern attempts to improve single-threaded performance
                 will have a negative impact on energy efficiency. This
                 has led to the belief that ``Big Cores'' are inherently
                 inefficient. To the contrary, we present a study which
                 finds that the increased complexity of the core
                 microarchitecture in recent generations of the
                 Intel{\textregistered} Core{\texttrademark} processor
                 have reduced both the time and energy
                 required to run various workloads. Moreover, taking out
                 the impact of process technology changes, our study
                 still finds the architecture and microarchitecture
                 changes --- such as the increase in SIMD width,
                 addition of the frontend caches, and the enhancement to
                 the out-of-order execution engine --- account for 1.2x
                 improvement in energy efficiency for these processors.
                 This paper provides real-world examples of how
                 architectural innovations can mitigate inefficiencies
                 associated with ``Big Cores'' --- for example, micro-op
                 caches obviate the costly decode of complex x86
                 instructions --- resulting in a core architecture that
                 is both high performance and energy efficient. It also
                 contributes to the understanding of how
                 microarchitecture affects performance, power and energy
                 efficiency by modeling the relationship between them",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{StAmant:2014:GPC,
  author =       "Ren{\'e}e {St. Amant} and Amir Yazdanbakhsh and Jongse
                 Park and Bradley Thwaites and Hadi Esmaeilzadeh and
                 Arjang Hassibi and Luis Ceze and Doug Burger",
  title =        "General-purpose code acceleration with
                 limited-precision analog computation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "505--516",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665746",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "As improvements in per-transistor speed and energy
                 efficiency diminish, radical departures from
                 conventional approaches are becoming critical to
                 improving the performance and energy efficiency of
                 general-purpose processors. We propose a solution ---
                 from circuit to compiler --- that enables general-purpose
                 use of limited-precision, analog hardware to accelerate
                 ``approximable'' code --- code that can tolerate
                 imprecise execution. We utilize an algorithmic
                 transformation that automatically converts approximable
                 regions of code from a von Neumann model to an
                 ``analog'' neural model. We outline the challenges of
                 taking an analog approach, including restricted-range
                 value encoding, limited precision in computation,
                 circuit inaccuracies, noise, and constraints on
                 supported topologies. We address these limitations with
                 a combination of circuit techniques, a
                 hardware/software interface, neural network training
                 techniques, and compiler support. Analog neural
                 acceleration provides whole application speedup of 3.7x
                 and energy savings of 6.3x with quality loss less than
                 10\% for all except one benchmark. These results show
                 that using limited-precision analog circuits for code
                 acceleration, through a neural approach, is both
                 feasible and beneficial over a range of
                 approximation-tolerant, emerging applications including
                 financial analysis, signal processing, robotics, 3D
                 gaming, compression, and image processing",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

%%% NOTE(review): the abstract below stops mid-sentence ("... problems
%%% related to the"); presumably it was truncated upstream.  Restore the
%%% remainder from the publisher's record when this entry is next revised.
@Article{Madhavan:2014:RLH,
  author = {Advait Madhavan and
    Timothy Sherwood and
    Dmitri Strukov},
  title = {Race logic: a hardware acceleration for dynamic programming
    algorithms},
  journal = j-COMP-ARCH-NEWS,
  fjournal = {ACM SIGARCH Computer Architecture News},
  volume = {42},
  number = {3},
  pages = {517--528},
  month = jun,
  year = {2014},
  DOI = {https://doi.org/10.1145/2678373.2665747},
  CODEN = {CANED2},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  bibdate = {Wed Dec 3 16:18:50 MST 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark = {ISCA '14 conference proceedings.},
  abstract = {We propose a novel computing approach, dubbed ``Race Logic'', in
    which information, instead of being represented as logic levels, as is
    done in conventional logic, is represented as a timing delay. Under this
    new information representation, computations can be performed by
    observing the relative propagation times of signals injected into the
    circuit (i.e. the outcome of races). Race Logic is especially suited for
    solving problems related to the},
}

@Article{Arnau:2014:ERF,
  author =       "Jose-Maria Arnau and Joan-Manuel Parcerisa and
                 Polychronis Xekalakis",
  title =        "Eliminating redundant fragment shader executions on a
                 mobile {GPU} via hardware memoization",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "529--540",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665748",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Redundancy is at the heart of graphical applications.
                 In fact, generating an animation typically involves the
                 succession of extremely similar images. In terms of
                 rendering these images, this behavior translates into
                 the creation of many fragment programs with the exact
                 same input data. We have measured this fragment
                 redundancy for a set of commercial Android
                 applications, and found that more than 40\% of the
                 fragments used in a frame have been already computed in
                 a prior frame. In this paper we try to exploit this
                 redundancy, using fragment memoization. Unfortunately,
                 this is not an easy task as most of the redundancy
                 exists across frames, rendering most HW based schemes
                 unfeasible. We thus first take a step back and try to
                 analyze the temporal locality of the redundant
                 fragments, their complexity, and the number of inputs
                 typically seen in fragment programs. The result of our
                 analysis is a task level memoization scheme, that
                 easily outperforms the current state-of-the-art in low
                 power GPUs. More specifically, our experimental results
                 show that our scheme is able to remove 59.7\% of the
                 redundant fragment computations on average. This
                 materializes to a significant speedup of 17.6\% on
                 average, while also improving the overall energy
                 efficiency by 8.9\% on average.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Zhu:2014:WAS,
  author =       "Yuhao Zhu and Vijay Janapa Reddi",
  title =        "{WebCore}: architectural support for mobile {Web}
                 browsing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "3",
  pages =        "541--552",
  month =        jun,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2678373.2665749",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The Web browser is undoubtedly the single most
                 important application in the mobile ecosystem. An
                 average user spends 72 minutes each day using the
                 mobile Web browser. Web browser internal engines
                 (e.g., WebKit) are also growing in importance because
                 they provide a common substrate for developing various
                 mobile Web applications. In a user-driven, interactive,
                 and latency-sensitive environment, the browser's
                 performance is crucial. However, the
                 battery-constrained nature of mobile devices limits the
                 performance that we can deliver for mobile Web
                 browsing. As traditional general-purpose techniques to
                 improve performance and energy efficiency fall short,
                 we must employ domain-specific knowledge while still
                 maintaining general-purpose flexibility. In this paper,
                 we first perform design-space exploration to identify
                 appropriate general-purpose architectures that uniquely
                 fit the characteristics of a popular Web browsing
                 engine. Despite our best effort, we discover sources of
                 energy inefficiency in these customized general-purpose
                 architectures. To mitigate these inefficiencies, we
                 propose, synthesize, and evaluate two new
                 domain-specific specializations, called the Style
                 Resolution Unit and the Browser Engine Cache. Our
                 optimizations boost energy efficiency and at the same
                 time improve mobile Web browsing performance. As
                 emerging mobile workloads increasingly rely more on Web
                 browser technologies, the type of optimizations we
                 propose will become important in the future and are
                 likely to have lasting widespread impact",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '14 conference proceedings.",
}

@Article{Kodama:2014:PFB,
  author =       "Yuetsu Kodama and Toshihiro Hanawa and Taisuke Boku
                 and Mitsuhisa Sato",
  title =        "{PEACH2}: an {FPGA}-based {PCIe} network device for
                 {Tightly Coupled Accelerators}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "4",
  pages =        "3--8",
  month =        sep,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2693714.2693716",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:35 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In recent years, heterogeneous clusters using
                 accelerators are often used for high performance
                 computing systems. In such clusters, inter-node
                 communication between accelerators requires several
                 memory copies via CPU memory, and the communication
                 latency incurred severely reduces performance. To solve
                 this problem, we have been proposing a Tightly Coupled
                 Accelerators (TCA) architecture intended to reduce the
                 communication latency between accelerators over
                 different nodes. In the TCA architecture, PCI Express
                 packets are used for communication among GPUs over
                 nodes. We developed a communication chip that we call
                 the named PEACH2 chip, to help implement the TCA
                 architecture. In this paper, we describe the details of
                 the design and implementation of the PEACH2 chip, with
                 respect to its routing mechanism and its DMA controller
                 using FPGA. We evaluated the PEACH2 on a new platform
                 that uses the latest Xeon CPU, IvyBridge, and achieved
                 2.3 GBytes/sec between GPUs over nodes, while the
                 performance was only 880 MBytes/sec on the previous
                 platform with SandyBridge.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '14 conference proceedings.",
}

@Article{Nomura:2014:PAM,
  author =       "Shimpei Nomura and Takuji Mitsuishi and Jun Suzuki and
                 Yuki Hayashi and Masaki Kan and Hideharu Amano",
  title =        "Performance Analysis of the {Multi-GPU} System with
                 {ExpEther}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "4",
  pages =        "9--14",
  month =        sep,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2693714.2693717",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:35 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "A GPU cluster in which each node provides a few GPUs
                 connected with PCIe (PCI Express) is commonly used for
                 acceleration of a large application program requiring
                 the performance beyond a single GPU. However, in such a
                 system, programmers are required to describe two
                 parallel programming between nodes in MPIs or other
                 message passing library as well as the fine grained
                 parallel programming for intra-GPUs. As a cost
                 effective alternative of such clusters, we propose a
                 novel multi-GPU system with ExpEther, a virtualization
                 technique which extends PCIe of a host CPU to Ethernet.
                 All devices connected by ExpEther can be treated as if
                 they were directly connected to the host. Evaluation
                 with two application programs with and without GPU-GPU
                 communication revealed that the proposed system with
                 four GPUs achieved 3.88 and 3.29 times performance
                 improvement respectively compared with a single GPU
                 system. Compared with GPU cluster system in which each
                 node provides a GPU, the proposed system achieved about
                 7\% and 30\% performance improvement, respectively.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '14 conference proceedings.",
}

@Article{Watanabe:2014:GAH,
  author =       "Tsuyoshi Watanabe and Naohito Nakasato",
  title =        "{GPU} Accelerated Hybrid Tree Algorithm for Collision
                 Less {$N$}-body Simulations",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "4",
  pages =        "15--20",
  month =        sep,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2693714.2693718",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:35 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/bibnet/subjects/fastmultipole.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "We propose a hybrid tree algorithm for reducing
                 calculation and communication cost of collision-less
                 N-body simulations. The concept of our algorithm is
                 that we split interaction force into two parts:
                 hard-force from neighbor particles and soft-force from
                 distant particles, and applying different time
                 integration for the forces. For hard-force calculation,
                 we can efficiently reduce the calculation and
                 communication cost of the parallel tree code because we
                 only need data of neighbor particles for this part. We
                 implement the algorithm on GPU clusters to accelerate
                 force calculation for both hard and soft force. As the
                 result of implementing the algorithm on GPU clusters,
                 we were able to reduce the communication cost and the
                 total execution time to 40\% and 80\% of that of a
                 normal tree algorithm, respectively. In addition, the
                 reduction factor relative the normal tree algorithm is
                 smaller for large number of processes, and we expect
                 that the execution time can be ultimately reduced down
                 to about 70\% of the normal tree algorithm.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '14 conference proceedings.",
}

@Article{Tsuyama:2014:GFA,
  author =       "Haruhisa Tsuyama and Tsutomu Maruyama",
  title =        "{GPU} and {FPGA} Acceleration of Level Set Method",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "4",
  pages =        "21--25",
  month =        sep,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2693714.2693719",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:35 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The level set method is one of the most powerful image
                 segmentation methods. Its computational complexity,
                 however, is very high, and many approaches to reduce
                 the computation time have been proposed. In this paper,
                 we describe a new level set algorithm for parallel
                 processing, and its implementation on GPU and FPGA. The
                 computational complexity of this algorithm is higher
                 than previous algorithms, but it is possible to achieve
                 higher performance by parallel processing. We
                 implemented the algorithm on GeForce GTX780Ti, and
                 Xilinx XC7VX485T, and compared their performances.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '14 conference proceedings.",
}

@Article{Tanabe:2014:FAO,
  author =       "Yu Tanabe and Tsutomu Maruyama",
  title =        "Fast and Accurate Optical Flow Estimation using
                 {FPGA}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "4",
  pages =        "27--32",
  month =        sep,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2693714.2693720",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:35 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In this paper, we extend an approach used in the
                 stereo vision for the optical flow estimation to
                 achieve lower error rates. In the optical flow
                 estimation, two dimensional search is required, and
                 more hardware resources becomes necessary than the
                 stereo vision that requires only one dimensional
                 search. In our implementation, the target image is
                 divided into sub-images, and they are processed in turn
                 to reduce the required circuit size. The error rates by
                 our system is much lower than previous works, and its
                 processing speed is fast enough for practical
                 applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '14 conference proceedings.",
}

@Article{Torres-Huitzil:2014:AEI,
  author =       "Cesar Torres-Huitzil and Marco Aurelio
                 Nu{\~n}o-Maganda",
  title =        "Area-time Efficient Implementation of Local Adaptive
                 Image Thresholding in Reconfigurable Hardware",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "4",
  pages =        "33--38",
  month =        sep,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2693714.2693721",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:35 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Local adaptive thresholding plays an important role in
                 image binarization since it is used to effectively
                 distinguish objects of interest from background
                 regions. This step affects the performance of further
                 processing stages in embedded computer vision
                 applications. In local thresholding, a threshold is
                 defined for each pixel as a function of all pixels
                 within a rectangular neighborhood, and as a
                 consequence, this yields a high computational cost
                 requiring significant processing time when thresholding
                 high resolution images or large data sets. This paper
                 presents an area-time efficient hardware implementation
                 of a local adaptive thresholding technique based on the
                 Bernsen algorithm targeted to a field programmable gate
                 array (FPGA) device. Experimental results show that the
                 proposed implementation is resource efficient and able
                 to process a 1024x1024 gray level image in less than 10
                 milliseconds independent of the neighborhood size. The
                 architecture demonstrates over 100-fold speedup
                 compared to a straightforward software implementation
                 of the original Bernsen algorithm on a desktop
                 computer.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '14 conference proceedings.",
}

@Article{Gohringer:2014:RMS,
  author =       "Diana G{\"o}hringer",
  title =        "Reconfigurable Multiprocessor Systems: Handling
                 {Hydras} Heads --- A Survey",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "4",
  pages =        "39--44",
  month =        sep,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2693714.2693722",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:35 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Novel solutions are needed to fulfill the increasing
                 demands of embedded systems, i.e. lowering the energy
                 consumption, increasing the performance, reducing the
                 development time and keeping the costs as low as
                 possible. In addition, there exist several
                 applications, which require runtime adaptations of the
                 algorithms based on the connection to its environment.
                 These challenges can be solved by using reconfigurable
                 Multiprocessor Systems-on-Chip (MPSoCs), which can
                 adapt the hardware as well as the software to the
                 application requirements and therefore achieve a high
                 computational efficiency as well as a high flexibility.
                 However, the development, the programming and the
                 operation of such flexible and heterogeneous systems is
                 very complex as the many criteria (Performance, power
                 consumption, costs, development time, runtime
                 adaptations, etc.) open a huge design space. In this
                 paper an overview of the challenges faced when
                 developing runtime adaptive MPSoCs is given. Finally,
                 for each challenge a survey of possible solutions are
                 presented.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '14 conference proceedings.",
}

@Article{Sano:2014:FBC,
  author =       "Kentaro Sano and Ryotaro Chiba and Tomoya Ueno and
                 Hayato Suzuki and Ryo Ito and Satoru Yamamoto",
  title =        "{FPGA}-based Custom Computing Architecture for
                 Large-Scale Fluid Simulation with Building Cube
                 Method",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "4",
  pages =        "45--50",
  month =        sep,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2693714.2693723",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:35 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "We are designing a custom computing machine for
                 large-scale fluid simulation with the building-cube
                 method (BCM). In BCM, parallel computation is performed
                 with cubes, each of which is an orthogonal grid with a
                 fixed resolution of cells. Although BCM is advantageous
                 in balancing loads with cubes, it also has a problem of
                 efficiency and scalability for computing with
                 general-purpose supercomputers due to insufficient
                 memory bandwidth and communication overhead of an
                 interconnection network. In this paper, we present a
                 custom computing architecture for FPGA-based scalable
                 BCM computation with a dedicated network, called an
                 accelerator domain network (ADN). We design a cube
                 engine which allows bandwidth-efficient computation of
                 cubes based on streamed stencil computation of the
                 fractional-step method. Through prototype
                 implementation, we evaluate the potential performance
                 of the architecture. For ALTERA Stratix V 28nm FPGA, we
                 estimate that a single FPGA has the peak performance of
                 107 GFlop/s in a single precision.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '14 conference proceedings.",
}

@Article{Wang:2014:GRS,
  author =       "Tao Wang and Guangyu Sun and Jiahua Chen and Jian Gong
                 and Haoyang Wu and Xiaoguang Li and Songwu Lu and Jason
                 Cong",
  title =        "{GRT}: a Reconfigurable {SDR} Platform with High
                 Performance and Usability",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "4",
  pages =        "51--56",
  month =        sep,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2693714.2693724",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:35 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The importance of software-defined radio (SDR)
                 continues to increase. However, existing SDR platforms
                 become less efficient as the wireless industry moves
                 towards Gigabit WiFi. In this work, we propose a novel
                 reconfigurable SDR platform named GRT. With the help of
                 reconfigurable architecture and corresponding software
                 support, SDR designs on GRT can leverage high
                 performance of the underlying hardware and provide
                 sufficient usability, including the support for
                 efficient modular design, commodity interface, good
                 programmability, code reusability, etc. We implement an
                 802.11a/g WiFi system on GRT to evaluate its
                 performance. The results demonstrate that GRT can
                 achieve a substantial improvement in usability while
                 still satisfying the performance requirement.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '14 conference proceedings.",
}

@Article{Ando:2014:CSF,
  author =       "Yuki Ando and Masataka Ogawa and Yuya Mizoguchi and
                 Kouta Kumagai and Miaw Torng-Der and Shinya Honda",
  title =        "A Case Study of {FPGA Blokus Duo} Solver by
                 System-Level Design",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "4",
  pages =        "57--62",
  month =        sep,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2693714.2693725",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:35 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper presents a case study to design a Blokus
                 Duo solver by using our system-level design toolkit
                 named SystemBuilder. We start with a modeling of the
                 Blokus Duo solver by C language and communication APIs
                 which are provided by SystemBuilder. Then, we
                 iteratively verified and tuned the parameters in the
                 solver by running the model on a general computer in
                 order to improve the performance of the solver.
                 Finally, the implementation on FPGA was automatically
                 generated from the model by SystemBuilder. Despite the
                 FPGA implementation, we have never written hardware
                 description language throughout the case study. The
                 case study demonstrates the easiness to design system
                 on FPGA by System-level design tools.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '14 conference proceedings.",
}

@Article{Joldes:2014:SSH,
  author =       "Mioara Joldes and Valentina Popescu and Warwick
                 Tucker",
  title =        "Searching for Sinks for the {H{\'e}non} Map using a
                 Multiple-precision {GPU} Arithmetic Library",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "4",
  pages =        "63--68",
  month =        sep,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2693714.2693726",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:35 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Today, GPUs represent an important hardware
                 development platform for many problems in dynamical
                 systems, where massive parallel computations are
                 needed. Beside that, many numerical studies of chaotic
                 dynamical systems require a computing precision higher
                 than common floating point (FP) formats. One such
                 application is locating invariant sets for chaotic
                 dynamical systems. In particular, we focus on
                 rigorously proving the existence of stable periodic
                 orbits for the H{\'e}non map for parameter values close
                 to the classical ones. For that, we present a
                 multiple-precision floating-point arithmetic library in
                 CUDA programming language for the NVIDIA GPU platform.
                 Our library extends the precision using so-called FP
                 expansions, where a number is represented as the
                 unevaluated sum of standard machine precision FP
                 numbers. This format offers the advantage of using
                 directly available and highly optimized hardware FP
                 operations. We generalize algorithms used by
                 multiple-precisions libraries such as Bailey's QD, or
                 the analogue GPU version, GQD.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '14 conference proceedings.",
}

@Article{Soejima:2014:MPF,
  author =       "Rie Soejima and Koji Okina and Keisuke Dohi and
                 Yuichiro Shibata and Kiyoshi Oguri",
  title =        "A Memory Profiling Framework for Stencil Computation
                 on an {FPGA} Accelerator with High Level Synthesis",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "4",
  pages =        "69--74",
  month =        sep,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2693714.2693727",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:35 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In this paper, we propose a framework to assist memory
                 access optimization for stencil computation on an FPGA
                 accelerator. Since the stencil computations such as
                 scientific simulations need large amounts of data,
                 efficient memory access is a key to achieving high
                 performance on FPGA accelerators. Therefore, we
                 implemented a stencil computation framework with a
                 memory performance profiler on MaxCompiler, which is
                 one of high level synthesis systems. The memory
                 profiler enables us to measure clock cycles for various
                 memory controller states; data transfer, stall, and
                 idle. We also implemented simple stencil computations
                 and practical FDTD electromagnetic field simulations on
                 top of the framework with various parameters to
                 evaluate and analyze memory performance. As a result of
                 execution experiments of the simple stencil
                 computations on a MAX34245A Data Flow Engine, it was
                 demonstrated that approximately 70\% of the peak memory
                 performance could be achieved for various stencil
                 types. On the other hand, the FDTD simulations, which
                 need many data streams, could not hit this memory
                 performance saturation point, because of increasing
                 complexity of memory controller modules. Through the
                 analysis of evaluation results obtained by our memory
                 performance profiling framework, a promising memory
                 access optimization approach for stencil computations
                 in which the complexity of the memory controller is
                 traded off against data access traffic is suggested.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '14 conference proceedings.",
}

@Article{Morishima:2014:PEG,
  author =       "Shin Morishima and Hiroki Matsutani",
  title =        "Performance Evaluations of Graph Database using {CUDA}
                 and {OpenMP} Compatible Libraries",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "4",
  pages =        "75--80",
  month =        sep,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2693714.2693728",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:35 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Graph databases use graph structures to store data
                 sets as nodes, edges, and properties. They are used to
                 store and search the relationships between a large
                 number of nodes, such as social networking services and
                 recommendation engines that use customer social
                 graphs. Since computation cost for graph search queries
                 increases as the graph becomes large, in this paper we
                 accelerate the graph search functions (Dijkstra and A*
                 algorithms) of a graph database Neo4j using two ways:
                 multi-threaded library and CUDA library for graphics
                 processing units (GPUs). We use 100,000-node graphs
                 generated based on a degree distribution of Facebook
                 social graph for evaluations. Our multi-threaded and
                 GPU-based implementations require an auxiliary
                 adjacency matrix for a target graph. The results show
                 that, when we do not take into account additional
                 overhead to generate the auxiliary adjacency matrix,
                 multi-threaded version improves the Dijkstra and A*
                 search performance by 16.2x and 13.8x compared to the
                 original implementation. The GPU-based implementation
                 improves the Dijkstra and A* search performance by
                 26.2x and 32.8x. When we take into account the
                 overhead, although the speed-ups by our implementations
                 are reduced, by reusing the auxiliary adjacency matrix
                 for multiple graph search queries we can significantly
                 improve the graph search performance.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '14 conference proceedings.",
}

@Article{Mitsuishi:2014:ABF,
  author =       "Takuji Mitsuishi and Shimpei Nomura and Jun Suzuki and
                 Yuki Hayashi and Masaki Kan and Hideharu Amano",
  title =        "Accelerating Breadth First Search on {GPU--BOX}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "4",
  pages =        "81--86",
  month =        sep,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2693714.2693729",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:35 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "The graph analysis has been applied in various fields
                 related to big-data processing and actively researched
                 in recent years. For processing a larger scale of
                 graph, parallel computing with multi-GPU system is paid
                 attention as an economical solution. Here, an efficient
                 parallel method is proposed to solve a typical graph
                 analysis, Breadth First Search (BFS) for multi-GPU
                 systems. Our target system is GPU-BOX, a prototype of
                 multi-GPU system using ExpEther which is a
                 virtualization technology based on PCI Express and
                 Ethernet. Although many vertices between GPUs must be
                 exchanged to run BFS on multi-GPU system, GPU-BOX
                 provides only small communication performance because
                 of using Ethernet. Our parallel algorithm for BFS is
                 designed so as to reduce the traffic between GPUs as
                 possible. The proposed method reduced 30--40\% traffic
                 between GPUs and improved the traditional parallel
                 method by 10\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '14 conference proceedings.",
}

@Article{Nunez-Yanez:2014:EER,
  author =       "Jose Nunez-Yanez",
  title =        "Energy Efficient Reconfigurable Computing with
                 Adaptive Voltage and Logic Scaling",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "4",
  pages =        "87--92",
  month =        sep,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2693714.2693730",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:35 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper investigates a novel energy-proportional
                 concept that combines closed-loop voltage scalability
                 and run-time hardware reconfiguration. Voltage scaling
                 is based on in-situ detectors that allow the device to
                 detect valid working voltage and frequency pairs at
                 run-time. The combined approach named AVLS (Adaptive
                 Voltage and Logic Scaling) enables the adaptation of
                 capacitance, voltage and frequency to obtain power and
                 energy savings based on workload, process and operating
                 conditions in a closed-loop configuration. The
                 technique is applied to a reconfigurable motion
                 estimation processor that can be configured with a
                 variable number of execution units and it is used as a
                 test vehicle. The results demonstrate that the proposed
                 voltage scaling can obtain up to 85\% reduction in
                 energy compared with nominal voltage operation at the
                 same frequency. This efficient energy point is obtained
                 at a voltage of 0.62 V and frequency of 56 MHz compared
                 with running the core at the same frequency and nominal
                 1 V. The addition of logic scalability means that if
                 enough device resources are available a parallel
                 configuration with six execution units operating at
                 0.62 V reduces energy by up to 95\% compared with a
                 single execution unit operating at 1 V and the same
                 frequency.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '14 conference proceedings.",
}

@Article{Thorson:2014:INb,
  author =       "Mark Thorson",
  title =        "{Internet} Nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "4",
  pages =        "93--101",
  month =        sep,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2693714.2693732",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:35 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '14 conference proceedings.",
}

%%% NOTE(review): this entry has the same DOI, volume, number, and pages
%%% as Thorson:2014:INb and appears to be a duplicate record; the key is
%%% retained so that existing citations of it still resolve.  The month
%%% matches that of Thorson:2014:INb (same journal issue).
@Article{Thorson:2014:INc,
  author =       "Mark Thorson",
  title =        "{Internet} Nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "42",
  number =       "4",
  pages =        "93--101",
  month =        sep,
  year =         "2014",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2693714.2693732",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Dec 3 16:18:50 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '14 conference proceedings.",
}

%%% The pages value "1--1" denotes a single-page item.
%%% NOTE(review): the abstract is written in the first person as a talk
%%% ("In this talk, I will argue ...", "a recent effort at Berkeley");
%%% confirm the author attribution against the publisher's record.
@Article{Ozturk:2015:ASC,
  author =       "Ozcan Ozturk",
  title =        "Architectural Support for Cyber-Physical Systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "1--1",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694375",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Cyber-physical systems are integrations of
                 computation, communication networks, and physical
                 dynamics. Although time plays a central role in the
                 physical world, all widely used software abstractions
                 lack temporal semantics. The notion of correct
                 execution of a program written in every widely-used
                 programming language today does not depend on the
                 temporal behavior of the program. But temporal behavior
                 matters in almost all systems, and most particularly in
                 cyber-physical systems. In this talk, I will argue that
                 time can and must become part of the semantics of
                 programs for a large class of applications. To
                 illustrate that this is both practical and useful, we
                 will describe a recent effort at Berkeley in the design
                 and implementation of timing-centric software systems.
                 Specifically, I will describe PRET machines, which
                 redefine the instruction-set architecture (ISA) of a
                 microprocessor to embrace temporal semantics. Such
                 machines can be used in high-confidence and
                 safety-critical systems, in energy-constrained systems,
                 in mixed-criticality systems, and as a Real-Time Unit
                 (RTU) that cooperates with a general-purpose processor
                 to provide real-time services, in a manner similar to
                 how a GPU provides graphics services.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

%%% The braces around "Mojim" in the title keep its capital letter under
%%% bibliography styles that downcase titles.
@Article{Zhang:2015:MRH,
  author =       "Yiying Zhang and Jian Yang and Amirsaman Memaripour
                 and Steven Swanson",
  title =        "{Mojim}: a Reliable and Highly-Available Non-Volatile
                 Memory System",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "3--18",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694370",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Next-generation non-volatile memories (NVMs) promise
                 DRAM-like performance, persistence, and high density.
                 They can attach directly to processors to form
                 non-volatile main memory (NVMM) and offer the
                 opportunity to build very low-latency storage systems.
                 These high-performance storage systems would be
                 especially useful in large-scale data center
                 environments where reliability and availability are
                 critical. However, providing reliability and
                 availability to NVMM is challenging, since the latency
                 of data replication can overwhelm the low latency that
                 NVMM should provide. We propose Mojim, a system that
                 provides the reliability and availability that
                 large-scale storage systems require, while preserving
                 the performance of NVMM. Mojim achieves these goals by
                 using a two-tier architecture in which the primary tier
                 contains a mirrored pair of nodes and the secondary
                 tier contains one or more secondary backup nodes with
                 weakly consistent copies of data. Mojim uses
                 highly-optimized replication protocols, software, and
                 networking stacks to minimize replication costs and
                 expose as much of NVMM's performance as possible. We
                 evaluate Mojim using raw DRAM as a proxy for NVMM and
                 using an industrial NVMM emulation system. We find that
                 Mojim provides replicated NVMM with similar or even
                 better performance than un-replicated NVMM (reducing
                 latency by 27\% to 63\% and delivering between 0.4 to
                 2.7X the throughput). We demonstrate that replacing
                 MongoDB's built-in replication system with Mojim
                 improves MongoDB's performance by 3.4 to 4X.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

%%% The braces around "SD--PCM" in the title preserve the acronym's
%%% capitals under bibliography styles that downcase titles.  Cell
%%% sizes in the abstract are written with a math-mode superscript
%%% (F$^2$).
@Article{Wang:2015:SPC,
  author =       "Rujia Wang and Lei Jiang and Youtao Zhang and Jun
                 Yang",
  title =        "{SD--PCM}: Constructing Reliable Super Dense Phase
                 Change Memory under Write Disturbance",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "19--31",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694352",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Phase Change Memory (PCM) has better scalability and
                 smaller cell size comparing to DRAM. However, further
                 scaling PCM cell in deep sub-micron regime results in
                 significant thermal based write disturbance (WD).
                 Naively allocating large inter-cell space increases
                 cell size from 4F$^2$ ideal to 12F$^2$. While a recent
                 work mitigates WD along word-lines through disturbance
                 resilient data encoding, it is ineffective for WD along
                 bit-lines, which is more severe due to widely adopted $
                 \mu $Trench structure in constructing PCM cell arrays.
                 Without mitigating WD along bit-lines, a PCM cell still
                 has 8F$^2$, which is 100\% larger than the ideal. In this
                 paper, we propose SD-PCM for achieving reliable write
                 operations in super dense PCM. In particular, we focus
                 on mitigating WD along bit-lines such that we can
                 construct super dense PCM chips with 4F$^2$ cell size,
                 i.e., the minimal for diode-switch based PCM. Based on
                 simple verification-n-correction (VnC), we propose
                 LazyCorrection and PreRead to effectively reduce VnC
                 overhead and minimize cascading verification during
                 write. We further propose (n:m)-Alloc for achieving
                 good tradeoff between VnC overhead minimization and
                 memory capacity loss. Our experimental results show
                 that, comparing to a WD-free low density PCM, SD-PCM
                 achieves 80\% capacity improvement in cell arrays while
                 incurring around 0--10\% performance degradation when
                 using different (n:m) allocators.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

%%% The braces around "DEUCE" in the title preserve the acronym's
%%% capitals under bibliography styles that downcase titles.
@Article{Young:2015:DWE,
  author =       "Vinson Young and Prashant J. Nair and Moinuddin K.
                 Qureshi",
  title =        "{DEUCE}: Write-Efficient Encryption for Non-Volatile
                 Memories",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "33--44",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694387",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Phase Change Memory (PCM) is an emerging Non Volatile
                 Memory (NVM) technology that has the potential to
                 provide scalable high-density memory systems. While the
                 non-volatility of PCM is a desirable property in order
                 to save leakage power, it also has the undesirable
                 effect of making PCM main memories susceptible to newer
                 modes of security vulnerabilities, for example,
                 accessibility to sensitive data if a PCM DIMM gets
                 stolen. PCM memories can be made secure by encrypting
                 the data. Unfortunately, such encryption comes with a
                 significant overhead in terms of bits written to PCM
                 memory, causing half of the bits in the line to change
                 on every write, even if the actual number of bits being
                 written to memory is small. Our studies show that a
                 typical writeback modifies, on average, only 12\% of
                 the bits in the cacheline. Thus, encryption causes
                 almost a 4x increase in the number of bits written to
                 PCM memories. Such extraneous bit writes cause
                 significant increase in write power, reduction in write
                 endurance, and reduction in write bandwidth. To provide
                 the benefit of secure memory in a write efficient
                 manner this paper proposes Dual Counter Encryption
                 (DEUCE). DEUCE is based on the observation that a
                 typical writeback only changes a few words, so DEUCE
                 reencrypts only the words that have changed. We show
                 that DEUCE reduces the number of modified bits per
                 writeback for a secure memory from 50\% to 24\%, which
                 improves performance by 27\% and increases lifetime by
                 2x.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

%%% The braces around "TSO" in the title preserve the acronym's capitals
%%% under bibliography styles that downcase titles.
@Article{Morrison:2015:TBT,
  author =       "Adam Morrison and Yehuda Afek",
  title =        "Temporally Bounding {TSO} for Fence-Free Asymmetric
                 Synchronization",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "45--58",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694374",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper introduces a temporally bounded total store
                 ordering (TBTSO) memory model, and shows that it
                 enables nonblocking fence-free solutions to asymmetric
                 synchronization problems, such as those arising in
                 memory reclamation and biased locking. TBTSO
                 strengthens the TSO memory model by bounding the time
                 it takes a store to drain from the store buffer into
                 memory. This bound enables devising fence-free
                 algorithms for asymmetric problems, which require a
                 performance-critical fast path to synchronize with an
                 infrequently executed slow path. We demonstrate this by
                 constructing (1) a fence-free version of the hazard
                 pointers memory reclamation scheme, and (2) a
                 fence-free biased lock algorithm which is compatible
                 with unmanaged environments as it does not rely on safe
                 points or similar mechanisms. We further argue that
                 TBTSO can be implemented in hardware with modest
                 modifications to existing TSO architectures. However,
                 our design makes assumptions about proprietary
                 implementation details of commercial hardware; it thus
                 best serves as a starting point for a discussion on the
                 feasibility of hardware TBTSO implementation. We also
                 show how minimal OS support enables the adaptation of
                 TBTSO algorithms to x86 systems.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

%%% The braces around "NOrec" in the title preserve its mixed-case
%%% spelling under bibliography styles that downcase titles.
@Article{Matveev:2015:RHN,
  author =       "Alexander Matveev and Nir Shavit",
  title =        "Reduced Hardware {NOrec}: a Safe and Scalable Hybrid
                 Transactional Memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "59--71",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694393",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Because of hardware TM limitations, software fallbacks
                 are the only way to make TM algorithms guarantee
                 progress. Nevertheless, all known software fallbacks to
                 date, from simple locks to sophisticated versions of
                 the NOrec Hybrid TM algorithm, have either limited
                 scalability or weakened semantics. We propose a novel
                 reduced-hardware (RH) version of the NOrec HyTM
                 algorithm. Instead of an all-software slow path, in our
                 RH NOrec the slow-path is a ``mix'' of hardware and
                 software: one short hardware transaction executes a
                 maximal amount of initial reads in the hardware, and
                 the second executes all of the writes. This novel
                 combination of the RH approach and the NOrec algorithm
                 delivers the first Hybrid TM that scales while fully
                 preserving the hardware's original semantics of opacity
                 and privatization. Our GCC implementation of RH NOrec
                 is promising in that it shows improved performance
                 relative to all prior methods, at the concurrency
                 levels we could test today.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

%%% The abstract spells "naive" with {\"\i} (a dieresis over a dotless
%%% i); the enclosing braces keep the double quote of the accent command
%%% from terminating the quote-delimited field value.
@Article{Orr:2015:SUR,
  author =       "Marc S. Orr and Shuai Che and Ayse Yilmazer and
                 Bradford M. Beckmann and Mark D. Hill and David A.
                 Wood",
  title =        "Synchronization Using Remote-Scope Promotion",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "73--86",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694350",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Heterogeneous system architecture (HSA) and OpenCL
                 define scoped synchronization to facilitate low
                 overhead communication across a subset of threads.
                 Scoped synchronization works well for static sharing
                 patterns, where consumer threads are known a priori. It
                 works poorly for dynamic sharing patterns (e.g., work
                 stealing) where programmers cannot use a faster small
                 scope due to the rare possibility that the work is
                 stolen by a thread in a distant slower scope. This puts
                 programmers in a conundrum: optimize the common case by
                 synchronizing at a faster small scope or use work
                 stealing at a slower large scope. In this paper, we
                 propose to extend scoped synchronization with
                 remote-scope promotion. This allows the most frequent
                 sharers to synchronize through a small scope.
                 Infrequent sharers synchronize by promoting that remote
                 small scope to a larger shared scope. Synchronization
                 using remote-scope promotion provides performance
                 robustness for dynamic workloads, where the benefits
                 provided by scoped synchronization and work stealing
                 are hard to anticipate. Compared to a na{\"\i}ve
                 baseline, static scoped synchronization alone achieves
                 a 1.07x speedup on average and dynamic work stealing
                 alone achieves a 1.18x speedup on average. In contrast,
                 synchronization using remote-scope promotion achieves a
                 robust 1.25x speedup on average, across a diverse set
                 of graph benchmarks and inputs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

%%% The braces around "GhostRider" in the title preserve its mixed-case
%%% spelling under bibliography styles that downcase titles.
@Article{Liu:2015:GHS,
  author =       "Chang Liu and Austin Harris and Martin Maas and
                 Michael Hicks and Mohit Tiwari and Elaine Shi",
  title =        "{GhostRider}: a Hardware-Software System for Memory
                 Trace Oblivious Computation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "87--101",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694385",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper presents a new, co-designed compiler and
                 architecture called GhostRider for supporting privacy
                 preserving computation in the cloud. GhostRider ensures
                 all programs satisfy a property called memory-trace
                 obliviousness (MTO): Even an adversary that observes
                 memory, bus traffic, and access times while the program
                 executes can learn nothing about the program's
                 sensitive inputs and outputs. One way to achieve MTO is
                 to employ Oblivious RAM (ORAM), allocating all code and
                 data in a single ORAM bank, and to also disable caches
                 or fix the rate of memory traffic. This baseline
                 approach can be inefficient, and so GhostRider's
                 compiler uses a program analysis to do better,
                 allocating data to non-oblivious, encrypted RAM (ERAM)
                 and employing a scratchpad when doing so will not
                 compromise MTO. The compiler can also allocate to
                 multiple ORAM banks, which sometimes significantly
                 reduces access times. We have formalized our approach
                 and proved it enjoys MTO. Our FPGA-based hardware
                 prototype and simulation results show that GhostRider
                 significantly outperforms the baseline strategy.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

%%% The braces around "ORAM" and "RAM" in the title preserve the
%%% acronyms' capitals under bibliography styles that downcase titles.
%%% The chip area in the abstract uses a math-mode superscript (mm$^2$).
@Article{Fletcher:2015:FON,
  author =       "Christopher W. Fletcher and Ling Ren and Albert Kwon
                 and Marten van Dijk and Srinivas Devadas",
  title =        "Freecursive {ORAM}: [Nearly] Free Recursion and
                 Integrity Verification for Position-based Oblivious
                 {RAM}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "103--116",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694353",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Oblivious RAM (ORAM) is a cryptographic primitive that
                 hides memory access patterns as seen by untrusted
                 storage. Recently, ORAM has been architected into
                 secure processors. A big challenge for hardware ORAM
                 schemes is how to efficiently manage the Position Map
                 (PosMap), a central component in modern ORAM
                 algorithms. Implemented naively, the PosMap causes ORAM
                 to be fundamentally unscalable in terms of on-chip
                 area. On the other hand, a technique called Recursive
                 ORAM fixes the area problem yet significantly increases
                 ORAM's performance overhead. To address this challenge,
                 we propose three new mechanisms. We propose a new ORAM
                 structure called the PosMap Lookaside Buffer (PLB) and
                 PosMap compression techniques to reduce the performance
                 overhead from Recursive ORAM empirically (the latter
                 also improves the construction asymptotically). Through
                 simulation, we show that these techniques reduce the
                 memory bandwidth overhead needed to support recursion
                 by 95\%, reduce overall ORAM bandwidth by 37\% and
                 improve overall SPEC benchmark performance by 1.27x. We
                 then show how our PosMap compression techniques further
                 facilitate an extremely efficient integrity
                 verification scheme for ORAM which we call PosMap MAC
                 (PMMAC). For a practical parameterization, PMMAC
                 reduces the amount of hashing needed for integrity
                 checking by $ \geq 68 \times $ relative to prior
                 schemes and introduces only 7\% performance overhead.
                 We prototype our mechanisms in hardware and report area
                 and clock frequency for a complete ORAM design
                 post-synthesis and post-layout using an ASIC flow in a
                 32~nm commercial process. With 2 DRAM channels, the
                 design post-layout runs at 1~GHz and has a total area
                 of .47~mm$^2$. Depending on PLB-specific parameters, the
                 PLB accounts for 10\% to 26\% area. PMMAC costs 12\% of
                 total design area. Our work is the first to prototype
                 Recursive ORAM or ORAM with any integrity scheme in
                 hardware.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

%%% The braces around "PDP-11" and "C" in the title preserve their
%%% capitals under bibliography styles that downcase titles.
@Article{Chisnall:2015:BPA,
  author =       "David Chisnall and Colin Rothwell and Robert N. M.
                 Watson and Jonathan Woodruff and Munraj Vadera and
                 Simon W. Moore and Michael Roe and Brooks Davis and
                 Peter G. Neumann",
  title =        "Beyond the {PDP-11}: Architectural Support for a
                 Memory-Safe {C} Abstract Machine",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "117--130",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694367",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "We propose a new memory-safe interpretation of the C
                 abstract machine that provides stronger protection to
                 benefit security and debugging. Despite ambiguities in
                 the specification intended to provide implementation
                 flexibility, contemporary implementations of C have
                 converged on a memory model similar to the PDP-11, the
                 original target for C. This model lacks support for
                 memory safety despite well-documented impacts on
                 security and reliability. Attempts to change this model
                 are often hampered by assumptions embedded in a large
                 body of existing C code, dating back to the memory
                 model exposed by the original C compiler for the
                 PDP-11. Our experience with attempting to implement a
                 memory-safe variant of C on the CHERI experimental
                 microprocessor led us to identify a number of
                 problematic idioms. We describe these as well as their
                 interaction with existing memory safety schemes and the
                 assumptions that they make beyond the requirements of
                 the C specification. Finally, we refine the CHERI ISA
                 and abstract model for C, by combining elements of the
                 CHERI capability model and fat pointers, and present a
                 softcore CPU that implements a C abstract machine that
                 can run legacy C code with strong memory protection
                 guarantees.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

%%% The braces around "(PARD)" in the title preserve the acronym's
%%% capitals under bibliography styles that downcase titles.
@Article{Ma:2015:SDS,
  author =       "Jiuyue Ma and Xiufeng Sui and Ninghui Sun and Yupeng
                 Li and Zihao Yu and Bowen Huang and Tianni Xu and
                 Zhicheng Yao and Yun Chen and Haibin Wang and Lixin
                 Zhang and Yungang Bao",
  title =        "Supporting Differentiated Services in Computers via
                 Programmable Architecture for Resourcing-on-Demand
                 {(PARD)}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "131--143",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694382",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper presents PARD, a programmable architecture
                 for resourcing-on-demand that provides a new
                 programming interface to convey an application's
                 high-level information like quality-of-service
                 requirements to the hardware. PARD enables new
                 functionalities like fully hardware-supported
                 virtualization and differentiated services in
                 computers. PARD is inspired by the observation that a
                 computer is inherently a network in which hardware
                 components communicate via packets (e.g., over the NoC
                 or PCIe). We apply principles of software-defined
                 networking to this intra-computer network and address
                 three major challenges. First, to deal with the
                 semantic gap between high-level applications and
                 underlying hardware packets, PARD attaches a high-level
                 semantic tag (e.g., a virtual machine or thread ID) to
                 each memory-access, I/O, or interrupt packet. Second,
                 to make hardware components more manageable, PARD
                 implements programmable control planes that can be
                 integrated into various shared resources (e.g., cache,
                 DRAM, and I/O devices) and can differentially process
                 packets according to tag-based rules. Third, to
                 facilitate programming, PARD abstracts all control
                 planes as a device file tree to provide a uniform
                 programming interface via which users create and apply
                 tag-based rules. Full-system simulation results show
                 that by co-locating latency-critical memcached
                 applications with other workloads PARD can improve a
                 four-core computer's CPU utilization by up to a factor
                 of four without significantly increasing tail latency.
                 FPGA emulation based on a preliminary RTL
                 implementation demonstrates that the cache control
                 plane introduces no extra latency and that the memory
                 control plane can reduce queueing delay for
                 high-priority memory-access requests by up to a factor
                 of 5.6.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

@Article{Omote:2015:IAE,
  author =       "Yushi Omote and Takahiro Shinagawa and Kazuhiko Kato",
  title =        "Improving Agility and Elasticity in Bare-metal
                 Clouds",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "145--159",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694349",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Bare-metal clouds are an emerging
                 infrastructure-as-a-service (IaaS) that leases physical
                 machines (bare-metal instances) rather than virtual
                 machines, allowing resource-intensive applications to
                 have exclusive access to physical hardware.
                 Unfortunately, bare-metal instances require
                 time-consuming or OS-specific tasks for deployment due
                 to the lack of virtualization layers, thereby
                 sacrificing several beneficial features of traditional
                 IaaS clouds such as agility, elasticity, and OS
                 transparency. We present BMcast, an OS deployment
                 system with a special-purpose de-virtualizable virtual
                 machine monitor (VMM) that supports quick and
                 OS-transparent startup of bare-metal instances. BMcast
                 performs streaming OS deployment while allowing direct
                 access to physical hardware from the guest OS, and then
                 disappears after completing the deployment. Quick
                 startup of instances improves agility and elasticity
                 significantly, and OS transparency greatly simplifies
                 management tasks for cloud customers. Experimental
                 results have confirmed that BMcast initiated a
                 bare-metal instance 8.6 times faster than image
                 copying, and database performance on BMcast during
                 streaming OS deployment was comparable to that on a
                 state-of-the-art VMM without performing deployment.
                 BMcast incurred zero overhead after
                 de-virtualization.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

@Article{Haque:2015:FMI,
  author =       "Md E. Haque and Eom, Yong hun and Yuxiong He and Sameh
                 Elnikety and Ricardo Bianchini and Kathryn S.
                 McKinley",
  title =        "Few-to-Many: Incremental Parallelism for Reducing Tail
                 Latency in Interactive Services",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "161--175",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694384",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Interactive services, such as Web search,
                 recommendations, games, and finance, must respond
                 quickly to satisfy customers. Achieving this goal
                 requires optimizing tail (e.g., 99th+ percentile)
                 latency. Although every server is multicore,
                 parallelizing individual requests to reduce tail
                 latency is challenging because (1) service demand is
                 unknown when requests arrive; (2) blindly parallelizing
                 all requests quickly oversubscribes hardware resources;
                 and (3) parallelizing the numerous short requests will
                 not improve tail latency. This paper introduces
                 Few-to-Many (FM) incremental parallelization, which
                 dynamically increases parallelism to reduce tail
                 latency. FM uses request service demand profiles and
                 hardware parallelism in an offline phase to compute a
                 policy, represented as an interval table, which
                 specifies when and how much software parallelism to
                 add. At runtime, FM adds parallelism as specified by
                 the interval table indexed by dynamic system load and
                 request execution time progress. The longer a request
                 executes, the more parallelism FM adds. We evaluate FM
                 in Lucene, an open-source enterprise search engine, and
                 in Bing, a commercial Web search engine. FM improves
                 the 99th percentile response time up to 32\% in Lucene
                 and up to 26\% in Bing, compared to prior
                 state-of-the-art parallelization. Compared to running
                 requests sequentially in Bing, FM improves tail latency
                 by a factor of two. These results illustrate that
                 incremental parallelism is a powerful tool for reducing
                 tail latency.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

@Article{Colp:2015:PDS,
  author =       "Patrick Colp and Jiawen Zhang and James Gleeson and
                 Sahil Suneja and Eyal de Lara and Himanshu Raj and
                 Stefan Saroiu and Alec Wolman",
  title =        "Protecting Data on {Smartphones} and Tablets from
                 Memory Attacks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "177--189",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694380",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Smartphones and tablets are easily lost or stolen.
                 This makes them susceptible to an inexpensive class of
                 memory attacks, such as cold-boot attacks, using a bus
                 monitor to observe the memory bus, and DMA attacks.
                 This paper describes Sentry, a system that allows
                 applications and OS components to store their code and
                 data on the System-on-Chip (SoC) rather than in DRAM.
                 We use ARM-specific mechanisms originally designed for
                 embedded systems, but still present in today's mobile
                 devices, to protect applications and OS subsystems from
                 memory attacks.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

@Article{Dautenhahn:2015:NKO,
  author =       "Nathan Dautenhahn and Theodoros Kasampalis and Will
                 Dietz and John Criswell and Vikram Adve",
  title =        "Nested Kernel: an Operating System Architecture for
                 Intra-Kernel Privilege Separation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "191--206",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694386",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Monolithic operating system designs undermine the
                 security of computing systems by allowing single
                 exploits anywhere in the kernel to enjoy full
                 supervisor privilege. The nested kernel operating
                 system architecture addresses this problem by
                 ``nesting'' a small isolated kernel within a
                 traditional monolithic kernel. The ``nested kernel''
                 interposes on all updates to virtual memory
                 translations to assert protections on physical memory,
                 thus significantly reducing the trusted computing base
                 for memory access control enforcement. We incorporated
                 the nested kernel architecture into FreeBSD on x86-64
                 hardware while allowing the entire operating system,
                 including untrusted components, to operate at the
                 highest hardware privilege level by write-protecting
                 MMU translations and de-privileging the untrusted part
                 of the kernel. Our implementation inherently enforces
                 kernel code integrity while still allowing dynamically
                 loaded kernel modules, thus defending against code
                 injection attacks. We also demonstrate that the nested
                 kernel architecture allows kernel developers to isolate
                 memory in ways not possible in monolithic kernels by
                 introducing write-mediation and write-logging services
                 to protect critical system data structures. Performance
                 of the nested kernel prototype shows modest overheads:
                 $< 1\%$ average for Apache and 2.7\% for kernel
                 compile. Overall, our results and experience show that
                 the nested kernel design can be retrofitted to existing
                 monolithic kernels, providing important security
                 benefits.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

@Article{Tan:2015:DWS,
  author =       "Zhangxi Tan and Zhenghao Qian and Xi Chen and Krste
                 Asanovic and David Patterson",
  title =        "{DIABLO}: a Warehouse-Scale Computer Network Simulator
                 using {FPGAs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "207--221",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694362",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Motivated by rapid software and hardware innovation in
                 warehouse-scale computing (WSC), we visit the problem
                 of warehouse-scale network design evaluation. A WSC is
                 composed of about 30 arrays or clusters, each of which
                 contains about 3000 servers, leading to a total of
                 about 100,000 servers per WSC. We found many prior
                 experiments have been conducted on relatively small
                 physical testbeds, and they often assume the workload
                 is static and that computations are only loosely
                 coupled with the adaptive networking stack. We present
                 a novel and cost-efficient FPGA-based evaluation
                 methodology, called Datacenter-In-A-Box at LOw cost
                 (DIABLO), which treats arrays as whole computers with
                 tightly integrated hardware and software. We have built
                 a 3,000-node prototype running the full WSC software
                 stack. Using our prototype, we have successfully
                 reproduced a few WSC phenomena, such as TCP Incast and
                 memcached request latency long tail, and found that
                 results do indeed change with both scale and with
                 version of the full software stack.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

@Article{Hauswald:2015:SOE,
  author =       "Johann Hauswald and Michael A. Laurenzano and Yunqi
                 Zhang and Cheng Li and Austin Rovinski and Arjun
                 Khurana and Ronald G. Dreslinski and Trevor Mudge and
                 Vinicius Petrucci and Lingjia Tang and Jason Mars",
  title =        "{Sirius}: an Open End-to-End Voice and Vision Personal
                 Assistant and Its Implications for Future Warehouse
                 Scale Computers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "223--238",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694347",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "As user demand scales for intelligent personal
                 assistants (IPAs) such as Apple's Siri, Google's Google
                 Now, and Microsoft's Cortana, we are approaching the
                 computational limits of current datacenter
                 architectures. It is an open question how future server
                 architectures should evolve to enable this emerging
                 class of applications, and the lack of an open-source
                 IPA workload is an obstacle in addressing this
                 question. In this paper, we present the design of
                 Sirius, an open end-to-end IPA web-service application
                 that accepts queries in the form of voice and images,
                 and responds with natural language. We then use this
                 workload to investigate the implications of four points
                 in the design space of future accelerator-based server
                 architectures spanning traditional CPUs, GPUs, manycore
                 throughput co-processors, and FPGAs. To investigate
                 future server designs for Sirius, we decompose Sirius
                 into a suite of 7 benchmarks (Sirius Suite) comprising
                 the computationally intensive bottlenecks of Sirius. We
                 port Sirius Suite to a spectrum of accelerator
                 platforms and use the performance and power trade-offs
                 across these platforms to perform a total cost of
                 ownership (TCO) analysis of various server design
                 points. In our study, we find that accelerators are
                 critical for the future scalability of IPA services.
                 Our results show that GPU- and FPGA-accelerated servers
                 improve the query latency on average by 10x and 16x.
                 For a given throughput, GPU- and FPGA-accelerated
                 servers can reduce the TCO of datacenters by 2.6x and
                 1.4x, respectively.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

@Article{Xu:2015:ALD,
  author =       "Chao Xu and Felix Xiaozhu Lin and Yuyang Wang and Lin
                 Zhong",
  title =        "Automated {OS}-level Device Runtime Power Management",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "239--252",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694360",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Non-CPU devices on a modern system-on-a-chip (SoC),
                 ranging from accelerators to I/O controllers, account
                 for a significant portion of the chip area. It is
                 therefore vital for system energy efficiency that idle
                 devices can enter a low-power state while still meeting
                 the performance expectation. This is called device
                 runtime Power Management (PM) for which individual
                 device drivers in commodity OSes are held responsible
                 today. Based on the observations of existing drivers
                 and their evolution, we consider it harmful to rely on
                 drivers for device runtime PM. This paper identifies
                 three pieces of information as essential to device
                 runtime PM, and shows that they can be obtained without
                 involving drivers, either by using a software-only
                 approach, or more efficiently, by adding one register
                 bit to each device. We thus suggest a structural change
                 to the current Linux runtime PM framework, replacing
                 the PM code in all applicable drivers with a single
                 kernel module called the central PM agent. Experimental
                 evaluations show that the central PM agent is just as
                 effective as hand-tuned driver PM code. The paper also
                 presents a tool called PowerAdvisor that simplifies
                 driver PM efforts under the current Linux runtime PM
                 framework. PowerAdvisor analyzes execution traces and
                 suggests where to insert PM calls in driver source
                 code. Despite being a best-effort tool, PowerAdvisor
                 not only reproduces hand-tuned PM code from stock
                 drivers, but also correctly suggests PM code never
                 known before. Overall, our experience shows that it is
                 promising to ultimately free driver developers from
                 manual PM.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

@Article{Goiri:2015:CTV,
  author =       "{\'I}{\~n}igo Goiri and Thu D. Nguyen and Ricardo
                 Bianchini",
  title =        "{CoolAir}: Temperature- and Variation-Aware Management
                 for Free-Cooled Datacenters",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "253--265",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694378",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Despite its benefits, free cooling may expose servers
                 to high absolute temperatures, wide temperature
                 variations, and high humidity when datacenters are
                 sited at certain locations. Prior research (in
                 non-free-cooled datacenters) has shown that high
                 temperatures and/or wide temporal temperature
                 variations can harm hardware reliability. In this
                 paper, we identify the runtime management strategies
                 required to limit absolute temperatures, temperature
                 variations, humidity, and cooling energy in free-cooled
                 datacenters. As the basis for our study, we propose
                 CoolAir, a system that embodies these strategies. Using
                 CoolAir and a real free-cooled datacenter prototype, we
                 show that effective management requires cooling
                 infrastructures that can act smoothly. In addition, we
                 show that CoolAir can tightly manage temperature and
                 significantly reduce temperature variation, often at a
                 lower cooling cost than existing free-cooled
                 datacenters. Perhaps most importantly, based on our
                 results, we derive several principles and lessons that
                 should guide the design of management systems for
                 free-cooled datacenters of any size.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

@Article{Mishra:2015:PGM,
  author =       "Nikita Mishra and Huazhe Zhang and John D. Lafferty
                 and Henry Hoffmann",
  title =        "A Probabilistic Graphical Model-based Approach for
                 Minimizing Energy Under Performance Constraints",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "267--281",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694373",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In many deployments, computer systems are
                 underutilized --- meaning that applications have
                 performance requirements that demand less than full
                 system capacity. Ideally, we would take advantage of
                 this under-utilization by allocating system resources
                 so that the performance requirements are met and energy
                 is minimized. This optimization problem is complicated
                 by the fact that the performance and power consumption
                 of various system configurations are often application
                 --- or even input --- dependent. Thus, practically,
                 minimizing energy for a performance constraint requires
                 fast, accurate estimations of application-dependent
                 performance and power tradeoffs. This paper
                 investigates machine learning techniques that enable
                 energy savings by learning Pareto-optimal power and
                 performance tradeoffs. Specifically, we propose LEO, a
                 probabilistic graphical model-based learning system
                 that provides accurate online estimates of an
                 application's power and performance as a function of
                 system configuration. We compare LEO to (1) offline
                 learning, (2) online learning, (3) a heuristic
                 approach, and (4) the true optimal solution. We find
                 that LEO produces the most accurate estimates and near
                 optimal energy savings.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

@Article{Pang:2015:MLL,
  author =       "Jun Pang and Chris Dwyer and Alvin R. Lebeck",
  title =        "More is Less, Less is More: Molecular-Scale Photonic
                 {NoC} Power Topologies",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "283--296",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694377",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Molecular-scale Network-on-Chip (mNoC) crossbars use
                 quantum dot LEDs as an on-chip light source, and
                 chromophores to provide optical signal filtering for
                 receivers. An mNoC reduces power consumption or enables
                 scaling to larger crossbars for a reduced energy budget
                 compared to current nanophotonic NoC crossbars. Since
                 communication latency is reduced by using a high-radix
                 crossbar, minimizing power consumption becomes a
                 primary design target. Conventional Single Writer
                 Multiple Reader (SWMR) photonic crossbar designs
                 broadcast all packets, and incur the commensurate
                 required power, even if only two nodes are
                 communicating. This paper introduces power topologies,
                 enabled by unique capabilities of mNoC technology, to
                 reduce overall interconnect power consumption. A power
                 topology corresponds to the logical connectivity
                 provided by a given power mode. Broadcast is one power
                 mode and it consumes the maximum power. Additional
                 power modes consume less power but allow a source to
                 communicate with only a statically defined, potentially
                 non-contiguous, subset of nodes. Overall interconnect
                 power is reduced if the more frequently communicating
                 nodes use modes that consume less power, while less
                 frequently communicating nodes use modes that consume
                 more power. We also investigate thread mapping
                 techniques to fully exploit power topologies. We
                 explore various mNoC power topologies with one, two and
                 four power modes for a radix-256 SWMR mNoC crossbar.
                 Our results show that the combination of power
                 topologies and intelligent thread mapping can reduce
                 total mNoC power by up to 51\% on average for a set of
                 12 SPLASH benchmarks. Furthermore performance is 10\%
                 better than conventional resonator-based photonic NoCs
                 and energy is reduced by 72\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

@Article{Sridharan:2015:MEM,
  author =       "Vilas Sridharan and Nathan DeBardeleben and Sean
                 Blanchard and Kurt B. Ferreira and Jon Stearley and
                 John Shalf and Sudhanva Gurumurthi",
  title =        "Memory Errors in Modern Systems: The Good, The Bad,
                 and The Ugly",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "297--310",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694348",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Several recent publications have shown that hardware
                 faults in the memory subsystem are commonplace. These
                 faults are predicted to become more frequent in future
                 systems that contain orders of magnitude more DRAM and
                 SRAM than found in current memory subsystems. These
                 memory subsystems will need to provide resilience
                 techniques to tolerate these faults when deployed in
                 high-performance computing systems and data centers
                 containing tens of thousands of nodes. Therefore, it is
                 critical to understand the efficacy of current hardware
                 resilience techniques to determine whether they will be
                 suitable for future systems. In this paper, we present
                 a study of DRAM and SRAM faults and errors from the
                 field. We use data from two leadership-class
                 high-performance computer systems to analyze the
                 reliability impact of hardware resilience schemes that
                 are deployed in current systems. Our study has several
                 key findings about the efficacy of many currently
                 deployed reliability techniques such as DRAM ECC, DDR
                 address/command parity, and SRAM ECC and parity. We
                 also perform a methodological study, and find that
                 counting errors instead of faults, a common practice
                 among researchers and data center operators, can lead
                 to incorrect conclusions about system reliability.
                 Finally, we use our data to project the needs of future
                 large-scale systems. We find that SRAM faults are
                 unlikely to pose a significantly larger reliability
                 threat in the future, while DRAM faults will be a major
                 concern and stronger DRAM resilience schemes will be
                 needed to maintain acceptable failure rates similar to
                 those found on today's systems.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

@Article{Yetim:2015:CMC,
  author =       "Yavuz Yetim and Sharad Malik and Margaret Martonosi",
  title =        "{CommGuard}: Mitigating Communication Errors in
                 Error-Prone Parallel Execution",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "311--323",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694354",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "As semiconductor technology scales towards
                 ever-smaller transistor sizes, hardware fault rates are
                 increasing. Since important application classes (e.g.,
                 multimedia, streaming workloads) are
                 data-error-tolerant, recent research has proposed
                 techniques that seek to save energy or improve yield by
                 exploiting error tolerance at the
                 architecture/microarchitecture level. Even seemingly
                 error-tolerant applications, however, will crash or
                 hang due to control-flow/memory addressing errors. In
                 parallel computation, errors involving inter-thread
                 communication can have equally catastrophic effects.
                 Our work explores techniques that mitigate the impact
                 of potentially catastrophic errors in parallel
                 computation, while still garnering power, cost, or
                 yield benefits from data error tolerance. Our proposed
                 CommGuard solution uses FSM-based checkers to pad and
                 discard data in order to maintain semantic alignment
                 between program control flow and the data communicated
                 between processors. CommGuard techniques are low
                 overhead and they exploit application information
                 already provided by some parallel programming languages
                 (e.g. StreamIt). By converting potentially catastrophic
                 communication errors into potentially tolerable data
                 errors, CommGuard allows important streaming
                 applications like JPEG and MP3 decoding to execute
                 without crashing and to sustain good output quality,
                 even for errors as frequent as every 500 $ \mu $s.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

@Article{Kim:2015:DEF,
  author =       "Dohyeong Kim and Yonghwi Kwon and William N. Sumner
                 and Xiangyu Zhang and Dongyan Xu",
  title =        "Dual Execution for On the Fly Fine Grained Execution
                 Comparison",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "325--338",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694394",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Execution comparison has many applications in
                 debugging, malware analysis, software feature
                 identification, and intrusion detection. Existing
                 comparison techniques have various limitations. Some
                 can only compare at the system event level and require
                 executions to take the same input. Some require storing
                 instruction traces that are very space-consuming and
                 have difficulty dealing with non-determinism. In this
                 paper, we propose a novel dual execution technique that
                 allows on-the-fly comparison at the instruction level.
                 Only differences between the executions are recorded.
                 It allows executions to proceed in a coupled mode such
                 that they share the same input sequence with the same
                 timing, reducing nondeterminism. It also allows them to
                 proceed in a decoupled mode such that the user can
                 interact with each one differently. Decoupled
                 executions can be recoupled to share the same future
                 inputs and facilitate further comparison. We have
                 implemented a prototype and applied it to identifying
                 functional components for reuse, comparative debugging
                 with new GDB primitives, and understanding real world
                 regression failures. Our results show that dual
                 execution is a critical enabling technique for
                 execution comparison.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

@Article{Hosek:2015:VUE,
  author =       "Petr Hosek and Cristian Cadar",
  title =        "{VARAN} the Unbelievable: an Efficient {$N$}-version
                 Execution Framework",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "339--353",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694390",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "With the widespread availability of multi-core
                 processors, running multiple diversified variants or
                 several different versions of an application in
                 parallel is becoming a viable approach for increasing
                 the reliability and security of software systems. The
                 key component of such $N$-version execution (NVX) systems
                 is a runtime monitor that enables the execution of
                 multiple versions in parallel. Unfortunately, existing
                 monitors impose either a large performance overhead or
                 rely on intrusive kernel-level changes. Moreover, none
                 of the existing solutions scales well with the number
                 of versions, since the runtime monitor acts as a
                 performance bottleneck. In this paper, we introduce
                 Varan, an NVX framework that combines selective binary
                 rewriting with a novel event-streaming architecture to
                 significantly reduce performance overhead and scale
                 well with the number of versions, without relying on
                 intrusive kernel modifications. Our evaluation shows
                 that Varan can run NVX systems based on popular C10k
                 network servers with only a modest performance
                 overhead, and can be effectively used to increase
                 software reliability using techniques such as
                 transparent failover, live sanitization and
                 multi-revision execution.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

@Article{Malka:2015:REI,
  author =       "Moshe Malka and Nadav Amit and Muli Ben-Yehuda and Dan
                 Tsafrir",
  title =        "{rIOMMU}: Efficient {IOMMU} for {I/O} Devices that
                 Employ Ring Buffers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "355--368",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694355",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The IOMMU allows the OS to encapsulate I/O devices in
                 their own virtual memory spaces, thus restricting their
                 DMAs to specific memory pages. The OS uses the IOMMU to
                 protect itself against buggy drivers and
                 malicious/errant devices. But the added protection
                 comes at a cost, degrading the throughput of
                 I/O-intensive workloads by up to an order of magnitude.
                 This cost has motivated system designers to trade off
                 some safety for performance, e.g., by leaving stale
                 information in the IOTLB for a while so as to amortize
                 costly invalidations. We observe that high-bandwidth
                 devices---like network and PCIe SSD
                 controllers---interact with the OS via circular ring
                 buffers that induce a sequential, predictable workload.
                 We design a ring IOMMU (rIOMMU) that leverages this
                 characteristic by replacing the virtual memory page
                 table hierarchy with a circular, flat table. A flat
                 table is adequately supported by exactly one IOTLB
                 entry, making every new translation an implicit
                 invalidation of the former and thus requiring explicit
                 invalidations only at the end of I/O bursts. Using
                 standard networking benchmarks, we show that rIOMMU
                 provides up to 7.56x higher throughput relative to the
                 baseline IOMMU, and that it is within 0.77--1.00x the
                 throughput of a system without IOMMU protection.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

@Article{Liu:2015:PPM,
  author =       "Daofu Liu and Tianshi Chen and Shaoli Liu and Jinhong
                 Zhou and Shengyuan Zhou and Olivier Temam and Xiaobing
                 Feng and Xuehai Zhou and Yunji Chen",
  title =        "{PuDianNao}: a Polyvalent Machine Learning
                 Accelerator",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "369--381",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694358",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Machine Learning (ML) techniques are pervasive tools
                 in various emerging commercial applications, but have
                 to be accommodated by powerful computer systems to
                 process very large data. Although general-purpose CPUs
                 and GPUs have provided straightforward solutions, their
                 energy-efficiencies are limited due to their excessive
                 supports for flexibility. Hardware accelerators may
                 achieve better energy-efficiencies, but each
                 accelerator often accommodates only a single ML
                 technique (family). According to the famous
                 No-Free-Lunch theorem in the ML domain, however, an ML
                 technique performs well on a dataset may perform poorly
                 on another dataset, which implies that such accelerator
                 may sometimes lead to poor learning accuracy. Even if
                 regardless of the learning accuracy, such accelerator
                 can still become inapplicable simply because the
                 concrete ML task is altered, or the user chooses
                 another ML technique. In this study, we present an ML
                 accelerator called PuDianNao, which accommodates seven
                 representative ML techniques, including $k$-means,
                 $k$-nearest neighbors, naive Bayes, support vector
                 machine, linear regression, classification tree, and
                 deep neural network. Benefited from our thorough
                 analysis on computational primitives and locality
                 properties of different ML techniques, PuDianNao can
                 perform up to 1056 GOP/s (e.g., additions and
                 multiplications) in an area of 3.51 mm$^2$, and consumes
                 596 mW only. Compared with the NVIDIA K20M GPU (28nm
                 process), PuDianNao (65nm process) is 1.20x faster, and
                 can reduce the energy by 128.41x.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

@Article{Goiri:2015:ABA,
  author =       "Inigo Goiri and Ricardo Bianchini and Santosh
                 Nagarakatte and Thu D. Nguyen",
  title =        "{ApproxHadoop}: Bringing Approximations to {MapReduce}
                 Frameworks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "383--397",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694351",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "We propose and evaluate a framework for creating and
                 running approximation-enabled MapReduce programs.
                 Specifically, we propose approximation mechanisms that
                 fit naturally into the MapReduce paradigm, including
                 input data sampling, task dropping, and accepting and
                 running a precise and a user-defined approximate
                 version of the MapReduce code. We then show how to
                 leverage statistical theories to compute error bounds
                 for popular classes of MapReduce programs when
                 approximating with input data sampling and/or task
                 dropping. We implement the proposed mechanisms and
                 error bound estimations in a prototype system called
                 ApproxHadoop. Our evaluation uses MapReduce
                 applications from different domains, including data
                 analytics, scientific computing, video encoding, and
                 machine learning. Our results show that ApproxHadoop
                 can significantly reduce application execution time
                 and/or energy consumption when the user is willing to
                 tolerate small errors. For example, ApproxHadoop can
                 reduce runtimes by up to 32x when the user can tolerate
                 an error of 1\% with 95\% confidence. We conclude that
                 our framework and system can make approximation easily
                 accessible to many application domains using the
                 MapReduce model.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

@Article{Ringenburg:2015:MDQ,
  author =       "Michael Ringenburg and Adrian Sampson and Isaac
                 Ackerman and Luis Ceze and Dan Grossman",
  title =        "Monitoring and Debugging the Quality of Results in
                 Approximate Programs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "399--411",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694365",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Energy efficiency is a key concern in the design of
                 modern computer systems. One promising approach to
                 energy-efficient computation, approximate computing,
                 trades off output accuracy for significant gains in
                 energy efficiency. However, debugging the actual cause
                 of output quality problems in approximate programs is
                 challenging. This paper presents dynamic techniques to
                 debug and monitor the quality of approximate
                 computations. We propose both offline debugging tools
                 that instrument code to determine the key sources of
                 output degradation and online approaches that monitor
                 the quality of deployed applications. We present two
                 offline debugging techniques and three online
                 monitoring mechanisms. The first offline tool
                 identifies correlations between output quality and the
                 execution of individual approximate operations. The
                 second tracks approximate operations that flow into a
                 particular value. Our online monitoring mechanisms are
                 complementary approaches designed for detecting quality
                 problems in deployed applications, while still
                 maintaining the energy savings from approximation. We
                 present implementations of our techniques and describe
                 their usage with seven applications. Our online
                 monitors control output quality while still maintaining
                 significant energy efficiency gains, and our offline
                 tools provide new insights into the effects of
                 approximation on output quality.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

@Article{Banavar:2015:WEC,
  author =       "Guruduth Banavar",
  title =        "{Watson} and the Era of Cognitive Computing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "413--413",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694376",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In the last decade, the availability of massive
                 amounts of new data, and the development of new machine
                 learning technologies, have augmented reasoning systems
                 to give rise to a new class of computing systems. These
                 ``Cognitive Systems'' learn from data, reason from
                 models, and interact naturally with us, to perform
                 complex tasks better than either humans or machines can
                 do by themselves. In essence, cognitive systems help us
                 perform like the best by penetrating the complexity of
                 big data and leverage the power of models. One of the
                 first cognitive systems, called Watson, demonstrated
                 through a Jeopardy! exhibition match, that it was
                 capable of answering complex factoid questions as
                 effectively as the world's champions. Follow-on
                 cognitive systems perform other tasks, such as
                 discovery, reasoning, and multi-modal understanding in
                 a variety of domains, such as healthcare, insurance,
                 and education. We believe such cognitive systems will
                 transform every industry and our everyday life for the
                 better. In this talk, I will give an overview of the
                 applications, the underlying capabilities, and some of
                 the key challenges, of cognitive systems.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

@Article{Stewart:2015:ZDW,
  author =       "Gordon Stewart and Mahanth Gowda and Geoffrey Mainland
                 and Bozidar Radunovic and Dimitrios Vytiniotis and
                 Cristina Luengo Agullo",
  title =        "{Ziria}: a {DSL} for Wireless Systems Programming",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "415--428",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694368",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Software-defined radio (SDR) brings the flexibility of
                 software to wireless protocol design, promising an
                 ideal platform for innovation and rapid protocol
                 deployment. However, implementing modern wireless
                 protocols on existing SDR platforms often requires
                 careful hand-tuning of low-level code, which can
                 undermine the advantages of software. Ziria is a new
                 domain-specific language (DSL) that offers programming
                 abstractions suitable for wireless physical (PHY) layer
                 tasks while emphasizing the pipeline reconfiguration
                 aspects of PHY programming. The Ziria compiler
                 implements a rich set of specialized optimizations,
                 such as lookup table generation and pipeline fusion. We
                 also offer a novel---due to pipeline
                 reconfiguration---algorithm to optimize the data widths
                 of computations in Ziria pipelines. We demonstrate the
                 programming flexibility of Ziria and the performance of
                 the generated code through a detailed evaluation of a
                 line-rate Ziria WiFi 802.11a/g implementation that is
                 on par and in many cases outperforms a hand-tuned
                 state-of-the-art C++ implementation on commodity
                 CPUs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

@Article{Mullapudi:2015:PAO,
  author =       "Ravi Teja Mullapudi and Vinay Vasista and Uday
                 Bondhugula",
  title =        "{PolyMage}: Automatic Optimization for Image
                 Processing Pipelines",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "429--443",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694364",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper presents the design and implementation of
                 PolyMage, a domain-specific language and compiler for
                 image processing pipelines. An image processing
                 pipeline can be viewed as a graph of interconnected
                 stages which process images successively. Each stage
                 typically performs one of point-wise, stencil,
                 reduction or data-dependent operations on image pixels.
                 Individual stages in a pipeline typically exhibit
                 abundant data parallelism that can be exploited with
                 relative ease. However, the stages also require high
                 memory bandwidth preventing effective utilization of
                 parallelism available on modern architectures. For
                 applications that demand high performance, the
                 traditional options are to use optimized libraries like
                 OpenCV or to optimize manually. While using libraries
                 precludes optimization across library routines, manual
                 optimization accounting for both parallelism and
                 locality is very tedious. The focus of our system,
                 PolyMage, is on automatically generating
                 high-performance implementations of image processing
                 pipelines expressed in a high-level declarative
                 language. Our optimization approach primarily relies on
                 the transformation and code generation capabilities of
                 the polyhedral compiler framework. To the best of our
                 knowledge, this is the first model-driven compiler for
                 image processing pipelines that performs complex
                 fusion, tiling, and storage optimization automatically.
                 Experimental results on a modern multicore system show
                 that the performance achieved by our automatic approach
                 is up to 1.81x better than that achieved through manual
                 tuning in Halide, a state-of-the-art language and
                 compiler for image processing pipelines. For a camera
                 raw image processing pipeline, our performance is
                 comparable to that of a hand-tuned implementation.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

@Article{Heckey:2015:CMC,
  author =       "Jeff Heckey and Shruti Patil and Ali JavadiAbhari and
                 Adam Holmes and Daniel Kudrow and Kenneth R. Brown and
                 Diana Franklin and Frederic T. Chong and Margaret
                 Martonosi",
  title =        "Compiler Management of Communication and Parallelism
                 for Quantum Computation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "445--456",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694357",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Quantum computing (QC) offers huge promise to
                 accelerate a range of computationally intensive
                 benchmarks. Quantum computing is limited, however, by
                 the challenges of decoherence: i.e., a quantum state
                 can only be maintained for short windows of time before
                 it decoheres. While quantum error correction codes can
                 protect against decoherence, fast execution time is the
                 best defense against decoherence, so efficient
                 architectures and effective scheduling algorithms are
                 necessary. This paper proposes the Multi-SIMD QC
                 architecture and then proposes and evaluates effective
                 schedulers to map benchmark descriptions onto
                 Multi-SIMD architectures. The Multi-SIMD model consists
                 of a small number of SIMD regions, each of which may
                 support operations on up to thousands of qubits per
                 cycle. Efficient Multi-SIMD operation requires
                 efficient scheduling. This work develops schedulers to
                 reduce communication requirements of qubits between
                 operating regions, while also improving parallelism. We
                 find that communication to global memory is a dominant
                 cost in QC. We also note that many quantum benchmarks
                 have long serial operation paths (although each
                 operation may be data parallel). To exploit this
                 characteristic, we introduce Longest-Path-First
                 Scheduling (LPFS) which pins operations to SIMD regions
                 to keep data in-place and reduce communication to
                 memory. The use of small, local scratchpad memories
                 also further reduces communication. Our results show a
                 3\% to 308\% improvement for LPFS over conventional
                 scheduling algorithms, and an additional 3\% to 64\%
                 improvement using scratchpad memories. Our work is the
                 most comprehensive software-to-quantum toolflow
                 published to date, with efficient and practical
                 scheduling techniques that reduce communication and
                 increase parallelism for full-scale quantum code
                 executing up to a trillion quantum gate operations.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

@Article{Hassaan:2015:KDG,
  author =       "Muhammad Amber Hassaan and Donald D. Nguyen and Keshav
                 K. Pingali",
  title =        "Kinetic Dependence Graphs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "457--471",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694363",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Task graphs or dependence graphs are used in runtime
                 systems to schedule tasks for parallel execution. In
                 problem domains such as dense linear algebra and signal
                 processing, dependence graphs can be generated from a
                 program by static analysis. However, in emerging
                 problem domains such as graph analytics, the set of
                 tasks and dependences between tasks in a program are
                 complex functions of runtime values and cannot be
                 determined statically. In this paper, we introduce a
                 novel approach for exploiting parallelism in such
                 programs. This approach is based on a data structure
                 called the kinetic dependence graph (KDG), which
                 consists of a dependence graph together with update
                 rules that incrementally update the graph to reflect
                 changes in the dependence structure whenever a task is
                 completed. We have implemented a simple programming
                 model that allows programmers to write these
                 applications at a high level of abstraction, and a
                 runtime within the Galois system [15] that builds the
                 KDG automatically and executes the program in parallel.
                 On a suite of programs that are difficult to
                 parallelize otherwise, we have obtained speedups of up
                 to 33 on 40 cores, out-performing third-party
                 implementations in many cases.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

@Article{Sidiroglou-Douskos:2015:TAI,
  author =       "Stelios Sidiroglou-Douskos and Eric Lahtinen and
                 Nathan Rittenhouse and Paolo Piselli and Fan Long and
                 Deokhwan Kim and Martin Rinard",
  title =        "Targeted Automatic Integer Overflow Discovery Using
                 Goal-Directed Conditional Branch Enforcement",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "473--486",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694389",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "We present a new technique and system, DIODE, for
                 automatically generating inputs that trigger
                 overflows at memory allocation sites. DIODE is designed
                 to identify relevant sanity checks that inputs must
                 satisfy to trigger overflows at target memory
                 allocation sites, then generate inputs that satisfy
                 these sanity checks to successfully trigger the
                 overflow. DIODE works with off-the-shelf, production
                 x86 binaries. Our results show that, for our benchmark
                 set of applications, and for every target memory
                 allocation site exercised by our seed inputs (which the
                 applications process correctly with no overflows),
                 either (1) DIODE is able to generate an input that
                 triggers an overflow at that site or (2) there is no
                 input that would trigger an overflow for the observed
                 target expression at that site.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

%%% Dhawan et al., ASPLOS'15 (Computer Architecture News 43(1)): generic
%%% hardware support for software-defined metadata-tag policies.
@Article{Dhawan:2015:ASS,
  author =       "Udit Dhawan and Catalin Hritcu and Raphael Rubin and
                 Nikos Vasilakis and Silviu Chiricescu and Jonathan M.
                 Smith and Thomas F. {Knight, Jr.} and Benjamin C.
                 Pierce and Andre DeHon",
  title =        "Architectural Support for Software-Defined Metadata
                 Processing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "487--502",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694383",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Optimized hardware for propagating and checking
                 software-programmable metadata tags can achieve low
                 runtime overhead. We generalize prior work on hardware
                 tagging by considering a generic architecture that
                 supports software-defined policies over metadata of
                 arbitrary size and complexity; we introduce several
                 novel microarchitectural optimizations that keep the
                 overhead of this rich processing low. Our model thus
                 achieves the efficiency of previous hardware-based
                 approaches with the flexibility of the software-based
                 ones. We demonstrate this by using it to enforce four
                 diverse safety and security policies---spatial and
                 temporal memory safety, taint tracking, control-flow
                 integrity, and code and data separation---plus a
                 composite policy that enforces all of them
                 simultaneously. Experiments on SPEC CPU2006 benchmarks
                 with a PUMP-enhanced RISC processor show modest impact
                 on runtime (typically under 10\%) and power ceiling
                 (less than 10\%), in return for some increase in energy
                 usage (typically under 60\%) and area for on-chip
                 memory structures (110\%).",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

%%% Zhang et al., ASPLOS'15 (Computer Architecture News 43(1)):
%%% SecVerilog, a Verilog extension for statically checked information
%%% flow, including timing channels.
@Article{Zhang:2015:HDL,
  author =       "Danfeng Zhang and Yao Wang and G. Edward Suh and
                 Andrew C. Myers",
  title =        "A Hardware Design Language for Timing-Sensitive
                 Information-Flow Security",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "503--516",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694372",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Information security can be compromised by leakage via
                 low-level hardware features. One recently prominent
                 example is cache probing attacks, which rely on timing
                 channels created by caches. We introduce a hardware
                 design language, SecVerilog, which makes it possible to
                 statically analyze information flow at the hardware
                 level. With SecVerilog, systems can be built with
                 verifiable control of timing channels and other
                 information channels. SecVerilog is Verilog, extended
                 with expressive type annotations that enable precise
                 reasoning about information flow. It also comes with
                 rigorous formal assurance: we prove that SecVerilog
                 enforces timing-sensitive noninterference and thus
                 ensures secure information flow. By building a secure
                 MIPS processor and its caches, we demonstrate that
                 SecVerilog makes it possible to build complex hardware
                 designs with verified security, yet with low overhead
                 in time, space, and HW designer effort.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

%%% Hicks et al., ASPLOS'15 (Computer Architecture News 43(1)): SPECS,
%%% a runtime hardware mechanism guarding against security-critical
%%% processor errata.
@Article{Hicks:2015:SLR,
  author =       "Matthew Hicks and Cynthia Sturton and Samuel T. King
                 and Jonathan M. Smith",
  title =        "{SPECS}: a Lightweight Runtime Mechanism for
                 Protecting Software from Security-Critical Processor
                 Bugs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "517--529",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694366",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Processor implementation errata remain a problem, and
                 worse, a subset of these bugs are security-critical. We
                 classified 7 years of errata from recent commercial
                 processors to understand the magnitude and severity of
                 this problem, and found that of 301 errata analyzed, 28
                 are security-critical. We propose the SECURITY-CRITICAL
                 PROCESSOR ERRATA CATCHING SYSTEM (SPECS) as a
                 low-overhead solution to this problem. SPECS employs a
                 dynamic verification strategy that is made lightweight
                 by limiting protection to only security-critical
                 processor state. As a proof-of-concept, we implement a
                 hardware prototype of SPECS in an open source
                 processor. Using this prototype, we evaluate SPECS
                 against a set of 14 bugs inspired by the types of
                 security-critical errata we discovered in the
                 classification phase. The evaluation shows that SPECS
                 is 86\% effective as a defense when deployed using only
                 ISA-level state; incurs less than 5\% area and power
                 overhead; and has no software run-time overhead.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

%%% Duan, Honarmand, and Torrellas, ASPLOS'15 (Computer Architecture
%%% News 43(1)): asymmetric groups of weak and strong memory fences
%%% under TSO.
@Article{Duan:2015:AMF,
  author =       "Yuelu Duan and Nima Honarmand and Josep Torrellas",
  title =        "Asymmetric Memory Fences: Optimizing Both Performance
                 and Implementability",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "531--543",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694388",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "There have been several recent efforts to improve the
                 performance of fences. The most aggressive designs
                 allow post-fence accesses to retire and complete before
                 the fence completes. Unfortunately, such designs
                 present implementation difficulties due to their
                 reliance on global state and structures. This paper's
                 goal is to optimize both the performance and the
                 implementability of fences. We start-off with a design
                 like the most aggressive ones but without the global
                 state. We call it Weak Fence or wF. Since the
                 concurrent execution of multiple wFs can deadlock, we
                 combine wFs with a conventional fence (i.e., Strong
                 Fence or sF) for the less performance-critical
                 thread(s). We call the result an Asymmetric fence
                 group. We also propose a taxonomy of Asymmetric fence
                 groups under TSO. Compared to past aggressive fences,
                 Asymmetric fence groups both are substantially easier
                 to implement and have higher average performance. The
                 two main designs presented (WS+ and W+) speed-up
                 workloads under TSO by an average of 13\% and 21\%,
                 respectively, over conventional fences.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

%%% Sung and Adve, ASPLOS'15 (Computer Architecture News 43(1)):
%%% DeNovoSync, arbitrary synchronization support for the DeNovo
%%% coherence protocol.
@Article{Sung:2015:DES,
  author =       "Hyojin Sung and Sarita V. Adve",
  title =        "{DeNovoSync}: Efficient Support for Arbitrary
                 Synchronization without Writer-Initiated
                 Invalidations",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "545--559",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694356",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Current shared-memory hardware is complex and
                 inefficient. Prior work on the DeNovo coherence
                 protocol showed that disciplined shared-memory
                 programming models can enable more complexity-,
                 performance-, and energy-efficient hardware than the
                 state-of-the-art MESI protocol. DeNovo, however,
                 severely restricted the synchronization constructs an
                 application can support. This paper proposes
                 DeNovoSync, a technique to support arbitrary
                 synchronization in DeNovo. The key challenge is that
                 DeNovo exploits race-freedom to use reader-initiated
                 local self-invalidations (instead of conventional
                 writer-initiated remote cache invalidations) to ensure
                 coherence. Synchronization accesses are inherently racy
                 and not directly amenable to self-invalidations.
                 DeNovoSync addresses this challenge using a novel
                 combination of registration of all synchronization
                 reads with a judicious hardware backoff to limit
                 unnecessary registrations. For a wide variety of
                 synchronization constructs and applications, compared
                 to MESI, DeNovoSync shows comparable or up to 22\%
                 lower execution time and up to 58\% lower network
                 traffic, enabling DeNovo's advantages for a much
                 broader class of software than previously possible.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

%%% Sengupta et al., ASPLOS'15 (Computer Architecture News 43(1)):
%%% EnfoRSer, hybrid static--dynamic enforcement of statically bounded
%%% region serializability (SBRS).
@Article{Sengupta:2015:HSD,
  author =       "Aritra Sengupta and Swarnendu Biswas and Minjia Zhang
                 and Michael D. Bond and Milind Kulkarni",
  title =        "Hybrid Static-Dynamic Analysis for Statically Bounded
                 Region Serializability",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "561--575",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694379",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Data races are common. They are difficult to detect,
                 avoid, or eliminate, and programmers sometimes
                 introduce them intentionally. However, shared-memory
                 programs with data races have unexpected, erroneous
                 behaviors. Intentional and unintentional data races
                 lead to atomicity and sequential consistency (SC)
                 violations, and they make it more difficult to
                 understand, test, and verify software. Existing
                 approaches for providing stronger guarantees for racy
                 executions add high run-time overhead and/or rely on
                 custom hardware. This paper shows how to provide
                 stronger semantics for racy programs while providing
                 relatively good performance on commodity systems. A
                 novel hybrid static--dynamic analysis called
                 \emph{EnfoRSer} provides end-to-end support for a
                 memory model called \emph{statically bounded region
                 serializability} (SBRS) that is not only stronger than
                 weak memory models but is strictly stronger than SC.
                 EnfoRSer uses static compiler analysis to transform
                 regions, and dynamic analysis to detect and resolve
                 conflicts at run time. By demonstrating commodity
                 support for a reasonably strong memory model with
                 reasonable overheads, we show its potential as an
                 always-on execution model.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

%%% Alglave et al., ASPLOS'15 (Computer Architecture News 43(1)):
%%% litmus-test study of weak concurrent behaviours on deployed GPUs,
%%% with a proposed Nvidia hardware model.
@Article{Alglave:2015:GCW,
  author =       "Jade Alglave and Mark Batty and Alastair F. Donaldson
                 and Ganesh Gopalakrishnan and Jeroen Ketema and Daniel
                 Poetzl and Tyler Sorensen and John Wickerson",
  title =        "{GPU} Concurrency: Weak Behaviours and Programming
                 Assumptions",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "577--591",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694391",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Concurrency is pervasive and perplexing, particularly
                 on graphics processing units (GPUs). Current
                 specifications of languages and hardware are
                 inconclusive; thus programmers often rely on folklore
                 assumptions when writing software. To remedy this state
                 of affairs, we conducted a large empirical study of the
                 concurrent behaviour of deployed GPUs. Armed with
                 litmus tests (i.e. short concurrent programs), we
                 questioned the assumptions in programming guides and
                 vendor documentation about the guarantees provided by
                 hardware. We developed a tool to generate thousands of
                 litmus tests and run them under stressful workloads. We
                 observed a litany of previously elusive weak
                 behaviours, and exposed folklore beliefs about GPU
                 programming---often supported by official
                 tutorials---as false. As a way forward, we propose a
                 model of Nvidia GPU hardware, which correctly models
                 every behaviour witnessed in our experiments. The model
                 is a variant of SPARC Relaxed Memory Order (RMO),
                 structured following the GPU concurrency hierarchy.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

%%% Park, Park, and Mahlke, ASPLOS'15 (Computer Architecture News
%%% 43(1)): Chimera, collaborative preemption (flushing, context
%%% switching, draining) for GPU multitasking.
@Article{Park:2015:CCP,
  author =       "Jason Jong Kyu Park and Yongjun Park and Scott
                 Mahlke",
  title =        "{Chimera}: Collaborative Preemption for Multitasking
                 on a Shared {GPU}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "593--606",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694346",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The demand for multitasking on graphics processing
                 units (GPUs) is constantly increasing as they have
                 become one of the default components on modern computer
                 systems along with traditional processors (CPUs).
                 Preemptive multitasking on CPUs has been primarily
                 supported through context switching. However, the same
                 preemption strategy incurs substantial overhead due to
                 the large context in GPUs. The overhead comes in two
                 dimensions: a preempting kernel suffers from a long
                 preemption latency, and the system throughput is wasted
                 during the switch. Without precise control over the
                 large preemption overhead, multitasking on GPUs has
                 little use for applications with strict latency
                 requirements. In this paper, we propose Chimera, a
                 collaborative preemption approach that can precisely
                 control the overhead for multitasking on GPUs. Chimera
                 first introduces streaming multiprocessor (SM)
                 flushing, which can instantly preempt an SM by
                 detecting and exploiting idempotent execution. Chimera
                 utilizes flushing collaboratively with two previously
                 proposed preemption techniques for GPUs, namely context
                 switching and draining to minimize throughput overhead
                 while achieving a required preemption latency.
                 Evaluations show that Chimera violates the deadline for
                 only 0.2\% of preemption requests when a 15$\mu$s
                 preemption latency constraint is used. For
                 multi-programmed workloads, Chimera can improve the
                 average normalized turnaround time by 5.5x, and system
                 throughput by 12.2\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

%%% Agarwal et al., ASPLOS'15 (Computer Architecture News 43(1)):
%%% bandwidth-aware and hint-based page placement for GPUs in
%%% heterogeneous memory systems.
@Article{Agarwal:2015:PPS,
  author =       "Neha Agarwal and David Nellans and Mark Stephenson and
                 Mike O'Connor and Stephen W. Keckler",
  title =        "Page Placement Strategies for {GPUs} within
                 Heterogeneous Memory Systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "607--618",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694381",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Systems from smartphones to supercomputers are
                 increasingly heterogeneous, being composed of both CPUs
                 and GPUs. To maximize cost and energy efficiency, these
                 systems will increasingly use globally-addressable
                 heterogeneous memory systems, making choices about
                 memory page placement critical to performance. In this
                 work we show that current page placement policies are
                 not sufficient to maximize GPU performance in these
                 heterogeneous memory systems. We propose two new page
                 placement policies that improve GPU performance: one
                 application agnostic and one using application profile
                 information. Our application agnostic policy,
                 bandwidth-aware (BW-AWARE) placement, maximizes GPU
                 throughput by balancing page placement across the
                 memories based on the aggregate memory bandwidth
                 available in a system. Our simulation-based results
                 show that BW-AWARE placement outperforms the existing
                 Linux INTERLEAVE and LOCAL policies by 35\% and 18\% on
                 average for GPU compute workloads. We build upon
                 BW-AWARE placement by developing a compiler-based
                 profiling mechanism that provides programmers with
                 information about GPU application data structure access
                 patterns. Combining this information with simple
                 program-annotated hints about memory placement, our
                 hint-based page placement approach performs within 90\%
                 of oracular page placement on average, largely
                 mitigating the need for costly dynamic page tracking
                 and migration.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

%%% Zhao and Shen, ASPLOS'15 (Computer Architecture News 43(1)):
%%% principled speculation for FSM parallelization without offline
%%% training.
@Article{Zhao:2015:FPS,
  author =       "Zhijia Zhao and Xipeng Shen",
  title =        "On-the-Fly Principled Speculation for {FSM}
                 Parallelization",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "619--630",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694369",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Finite State Machine (FSM) is the backbone of an
                 important class of applications in many domains. Its
                 parallelization has been extremely difficult due to
                 inherent strong dependences in the computation.
                 Recently, principled speculation shows good promise to
                 solve the problem. However, the reliance on offline
                 training makes the approach inconvenient to adopt and
                 hard to apply to many practical FSM applications, which
                 often deal with a large variety of inputs different
                 from training inputs. This work presents an assembly of
                 techniques that completely remove the needs for offline
                 training. The techniques include a set of theoretical
                 results on inherent properties of FSMs, and two newly
                 designed dynamic optimizations for efficient FSM
                 characterization. The new techniques, for the first
                 time, make principled speculation applicable on the
                 fly, and enable swift, automatic configuration of
                 speculative parallelizations to best suit a given FSM
                 and its current input. They eliminate the fundamental
                 barrier for practical adoption of principled
                 speculation for FSM parallelization. Experiments show
                 that the new techniques give significantly higher
                 speedups for some difficult FSM applications in the
                 presence of input changes.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

%%% David, Guerraoui, and Trigonakis, ASPLOS'15 (Computer Architecture
%%% News 43(1)): asynchronized concurrency (ASCY) patterns for
%%% concurrent search data structures.
@Article{David:2015:ACS,
  author =       "Tudor David and Rachid Guerraoui and Vasileios
                 Trigonakis",
  title =        "Asynchronized Concurrency: The Secret to Scaling
                 Concurrent Search Data Structures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "631--644",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694359",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "We introduce ``asynchronized concurrency (ASCY),'' a
                 paradigm consisting of four complementary programming
                 patterns. ASCY calls for the design of concurrent
                 search data structures (CSDSs) to resemble that of
                 their sequential counterparts. We argue that ASCY leads
                 to implementations which are portably scalable: they
                 scale across different types of hardware platforms,
                 including single and multi-socket ones, for various
                 classes of workloads, such as read-only and read-write,
                 and according to different performance metrics,
                 including throughput, latency, and energy. We
                 substantiate our thesis through the most exhaustive
                 evaluation of CSDSs to date, involving 6 platforms, 22
                 state-of-the-art CSDS algorithms, 10 re-engineered
                 state-of-the-art CSDS algorithms following the ASCY
                 patterns, and 2 new CSDS algorithms designed with ASCY
                 in mind. We observe up to 30\% improvements in
                 throughput in the re-engineered algorithms, while our
                 new algorithms out-perform the state-of-the-art
                 alternatives.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

%%% Bhatotia et al., ASPLOS'15 (Computer Architecture News 43(1)):
%%% iThreads, a pthreads-replacement library for parallel incremental
%%% computation.
@Article{Bhatotia:2015:ITL,
  author =       "Pramod Bhatotia and Pedro Fonseca and Umut A. Acar and
                 Bj{\"o}rn B. Brandenburg and Rodrigo Rodrigues",
  title =        "{iThreads}: a Threading Library for Parallel
                 Incremental Computation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "645--659",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694371",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Incremental computation strives for efficient
                 successive runs of applications by re-executing only
                 those parts of the computation that are affected by a
                 given input change instead of recomputing everything
                 from scratch. To realize these benefits automatically,
                 we describe iThreads, a threading library for parallel
                 incremental computation. iThreads supports unmodified
                 shared-memory multithreaded programs: it can be used as
                 a replacement for pthreads by a simple exchange of
                 dynamically linked libraries, without even recompiling
                 the application code. To enable such an interface, we
                 designed algorithms and an implementation to operate at
                 the compiled binary code level by leveraging
                 MMU-assisted memory access tracking and process-based
                 thread isolation. Our evaluation on a multicore
                 platform using applications from the PARSEC and Phoenix
                 benchmarks and two case-studies shows significant
                 performance gains.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

%%% Gidra et al., ASPLOS'15 (Computer Architecture News 43(1)):
%%% NumaGiC, a mostly-distributed garbage collector for large-heap
%%% applications on ccNUMA machines.
@Article{Gidra:2015:NGC,
  author =       "Lokesh Gidra and Ga{\"e}l Thomas and Julien Sopena and
                 Marc Shapiro and Nhan Nguyen",
  title =        "{NumaGiC}: a Garbage Collector for Big Data on Big
                 {NUMA} Machines",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "661--673",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694361",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "On contemporary cache-coherent Non-Uniform Memory
                 Access (ccNUMA) architectures, applications with a
                 large memory footprint suffer from the cost of the
                 garbage collector (GC), because, as the GC scans the
                 reference graph, it makes many remote memory accesses,
                 saturating the interconnect between memory nodes. We
                 address this problem with NumaGiC, a GC with a
                 mostly-distributed design. In order to maximise memory
                 access locality during collection, a GC thread avoids
                 accessing a different memory node, instead notifying a
                 remote GC thread with a message; nonetheless, NumaGiC
                 avoids the drawbacks of a pure distributed design,
                 which tends to decrease parallelism. We compare NumaGiC
                 with Parallel Scavenge and NAPS on two different ccNUMA
                 architectures running on the Hotspot Java Virtual
                 Machine of OpenJDK 7. On Spark and Neo4j, two
                 industry-strength analytics applications, with heap
                 sizes ranging from 160GB to 350GB, and on SPECjbb2013
                 and SPECjbb2005, NumaGiC improves overall performance
                 by up to 45\% over NAPS (up to 94\% over Parallel
                 Scavenge), and increases the performance of the
                 collector itself by up to 3.6x over NAPS (up to 5.4x
                 over Parallel Scavenge).",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

@Article{Nguyen:2015:FCR,
  author =       "Khanh Nguyen and Kai Wang and Yingyi Bu and Lu Fang
                 and Jianfei Hu and Guoqing Xu",
  title =        "{FACADE}: a Compiler and Runtime for (Almost)
                 Object-Bounded Big Data Applications",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "675--690",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694345",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The past decade has witnessed the increasing demands
                 on data-driven business intelligence that led to the
                 proliferation of data-intensive applications. A managed
                 object-oriented programming language such as Java is
                 often the developer's choice for implementing such
                 applications, due to its quick development cycle and
                 rich community resource. While the use of such
                 languages makes programming easier, their automated
                 memory management comes at a cost. When the managed
                 runtime meets Big Data, this cost is significantly
                 magnified and becomes a scalability-prohibiting
                 bottleneck. This paper presents a novel compiler
                 framework, called Facade, that can generate
                 highly-efficient data manipulation code by
                 automatically transforming the data path of an existing
                 Big Data application. The key treatment is that in the
                 generated code, the number of runtime heap objects
                 created for data types in each thread is (almost)
                 statically bounded, leading to significantly reduced
                 memory management cost and improved scalability. We
                 have implemented Facade and used it to transform 7
                 common applications on 3 real-world, already
                 well-optimized Big Data frameworks: GraphChi, Hyracks,
                 and GPS. Our experimental results are very positive:
                 the generated programs have (1) achieved a 3\%--48\%
                 execution time reduction and an up to 88X GC reduction;
                 (2) consumed up to 50\% less memory, and (3) scaled to
                 much larger datasets.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

@Article{Agrawal:2015:ASD,
  author =       "Varun Agrawal and Abhiroop Dabral and Tapti Palit and
                 Yongming Shen and Michael Ferdman",
  title =        "Architectural Support for Dynamic Linking",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "1",
  pages =        "691--702",
  month =        mar,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2786763.2694392",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Wed Jun 3 11:27:38 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "All software in use today relies on libraries,
                 including standard libraries (e.g., C, C++) and
                 application-specific libraries (e.g., libxml, libpng).
                 Most libraries are loaded in memory and dynamically
                 linked when programs are launched, resolving symbol
                 addresses across the applications and libraries.
                 Dynamic linking has many benefits: It allows code to be
                 reused between applications, conserves memory (because
                 only one copy of a library is kept in memory for all
                 the applications that share it), and allows libraries
                 to be patched and updated without modifying programs,
                 among numerous other benefits. However, these benefits
                 come at the cost of performance. For every call made to
                 a function in a dynamically linked library, a
                 trampoline is used to read the function address from a
                 lookup table and branch to the function, incurring
                 memory load and branch operations. Static linking
                 avoids this performance penalty, but loses all the
                 benefits of dynamic linking. Given its myriad benefits,
                 dynamic linking is the predominant choice today,
                 despite the performance cost. In this work, we propose
                 a speculative hardware mechanism to optimize dynamic
                 linking by avoiding executing the trampolines for
                 library function calls, providing the benefits of
                 dynamic linking with the performance of static linking.
                 Speculatively skipping the memory load and branch
                 operations of the library call trampolines improves
                 performance by reducing the number of executed
                 instructions and gains additional performance by
                 reducing pressure on the instruction and data caches,
                 TLBs, and branch predictors. Because the indirect
                 targets of library call trampolines do not change
                 during program execution, our speculative mechanism
                 never misspeculates in practice. We evaluate our
                 technique on real hardware with production software and
                 observe up to 4\% speedup using only 1.5KB of on-chip
                 storage.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'15 conference proceedings.",
}

@Article{Chien:2015:CSH,
  author =       "Andrew A. Chien and Tung Thanh-Hoang and Dilip
                 Vasudevan and Yuanwei Fang and Amirali Shambayati",
  title =        "$ 10 \times 10 $: a Case Study in Highly-Programmable
                 and Energy-Efficient Heterogeneous Federated
                 Architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "3",
  pages =        "2--9",
  month =        may,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2856113.2856115",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Dec 21 18:10:56 MST 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Customized architecture is widely recognized as an
                 important approach for improved performance and energy
                 efficiency. To balance generality and customization
                 benefit, researchers have proposed to federate
                 heterogeneous micro-engines. Using the $ 10 \times 10 $
                 architecture and an integrated image and vision
                 benchmark as a case study, we explore the performance
                 and energy benefits achievable. Results for current
                 32nm technology and DDR3 memory show $ 10 \times 10 $
                 architecture benefits of 140$ \times $ performance and
                 72$ \times $ energy overall. Adding 3D-stacked DRAM
                 increase benefits to 171$ \times $ (performance) and
                 100$ \times $ (energy). Finally, considering future 7nm
                 transistor process, benefits as large as 597$ \times $
                 (performance) and 137$ \times $ energy are observed.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Thorson:2015:INa,
  author =       "Mark Thorson",
  title =        "{Internet} Nuggets",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "3",
  pages =        "10--16",
  month =        may,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2856113.2856117",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Dec 21 18:10:56 MST 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Herbordt:2015:LLG,
  author =       "Martin Herbordt and Miriam Leeser",
  title =        "Off-Loading {LET} Generation to {PEACH2}: a Switching
                 Hub for High Performance {GPU} Clusters",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "4",
  pages =        "3--8",
  month =        sep,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2927964.2927966",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Apr 22 17:03:53 MDT 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/bibnet/subjects/fastmultipole.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "A hardware local essential tree (LET) generator used
                 in an N-body simulation is implemented on the FPGA of
                 PEACH2 (PCI Express Adaptive Communication Hub ver2), a
                 low latency switching hub for high performance GPU
                 clusters. By using the pipelined on-the-fly execution
                 with a multipole acceptance criterion judging module
                 and a data updating module, the generation performance
                 is 2.2 times faster than that with the CPU. When data
                 communication is considered, the performance was 7.2
                 times as the case with the CPU.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '15 conference proceedings.",
}

@Article{Okina:2015:PPP,
  author =       "Koji Okina and Rie Soejima and Kota Fukumoto and
                 Yuichiro Shibata and Kiyoshi Oguri",
  title =        "Power Performance Profiling of {$3$-D} Stencil
                 Computation on an {FPGA} Accelerator for Efficient
                 Pipeline Optimization",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "4",
  pages =        "9--14",
  month =        sep,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2927964.2927967",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Apr 22 17:03:53 MDT 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper discusses power-performance optimization
                 for 3-D stencil computing on a stream-oriented FPGA
                 accelerator with high-level synthesis. Taking a heat
                 conduction simulation and an FDTD electromagnetic field
                 simulation as benchmark applications, power-performance
                 profiling results are presented focusing on the effect
                 of high-level pipeline parameters. As a result, it is
                 shown that the optimal power efficiency can be achieved
                 basically by optimizing the execution performance. The
                 relationship between power efficiency and the clock
                 frequency is also discussed.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '15 conference proceedings.",
}

@Article{Lashgar:2015:CSR,
  author =       "Ahmad Lashgar and Ebad Salehi and Amirali Baniasadi",
  title =        "A Case Study in Reverse Engineering {GPGPUs}:
                 Outstanding Memory Handling Resources",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "4",
  pages =        "15--21",
  month =        sep,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2927964.2927968",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Apr 22 17:03:53 MDT 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "During recent years, GPU micro-architectures have
                 changed dramatically, evolving into powerful many-core
                 deep-multithreaded platforms for parallel workloads.
                 While important micro-architectural modifications
                 continue to appear in every new generation of these
                 processors, unfortunately, little is known about the
                 details of these innovative designs. One of the key
                 questions in understanding GPUs is how they deal with
                 outstanding memory misses. Our goal in this study is to
                 find answers to this question. To this end, we develop
                 a set of micro-benchmarks in CUDA to understand the
                 outstanding memory requests handling resources.
                 Particularly, we study two NVIDIA GPGPUs (Fermi and
                 Kepler) and estimate their capability in handling
                 outstanding memory requests. We show that Kepler can
                 issue nearly 32X higher number of outstanding memory
                 requests, compared to Fermi. We explain this
                 enhancement by Kepler's architectural modifications in
                 outstanding memory request handling resources.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '15 conference proceedings.",
}

@Article{Hayashi:2015:LRO,
  author =       "Ami Hayashi and Yuta Tokusashi and Hiroki Matsutani",
  title =        "A Line Rate Outlier Filtering {FPGA NIC} using {10GbE}
                 Interface",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "4",
  pages =        "22--27",
  month =        sep,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2927964.2927969",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Apr 22 17:03:53 MDT 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "As data sets grow rapidly in size and the number, an
                 outlier detection that filters unnecessary normal
                 information becomes important. In this paper, we
                 propose to move the unsupervised outlier detection from
                 an application layer to a network interface card (NIC).
                 Only anomalous items or events are received for a
                 network protocol stack and the other packets are
                 discarded at the NIC. The demands for storage and
                 computation costs at a host are thus dramatically
                 reduced. However, because normal items are discarded at
                 the NIC and the application layer can no longer know
                 what is normal, in our approach, the application at the
                 host periodically peeks at the NIC buffer. We select an
                 outlier detection based on the Mahalanobis distance as
                 one of the simplest algorithms. Our approach is
                 implemented on an FPGA-based NIC that has 10GbE
                 interfaces. The sampling frequency of the NIC buffer
                 vs. outlier detection precision is analyzed. Real
                 experiments using the FPGA NIC demonstrate a 14,000,000
                 samples-per-second throughput in performance, which is
                 close to the 10GbE line rate.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '15 conference proceedings.",
}

@Article{Jain:2015:ADA,
  author =       "Abhishek Kumar Jain and Xiangwei Li and Suhaib A.
                 Fahmy and Douglas L. Maskell",
  title =        "Adapting the {DySER} Architecture with {DSP} Blocks as
                 an Overlay for the {Xilinx Zynq}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "4",
  pages =        "28--33",
  month =        sep,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2927964.2927970",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Apr 22 17:03:53 MDT 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Coarse-grained overlay architectures have been shown
                 to be effective when paired with general purpose
                 processors, offering software-like programmability,
                 fast compilation, and improved design productivity.
                 These architectures enable general purpose hardware
                 accelerators, allowing hardware design at a higher
                 level of abstraction, but at the cost of area and
                 performance overheads. This paper examines the DySER
                 overlay architecture as a hardware accelerator paired
                 with a general purpose processor in a hybrid FPGA such
                 as the Xilinx Zynq. We evaluate the DySER architecture
                 mapped on the Xilinx Zynq and show that it suffers from
                 a significant area and performance overhead. We then
                 propose an improved functional unit architecture using
                 the flexibility of the DSP48E1 primitive which results
                 in a 2.5 times frequency improvement and 25\% area
                 reduction compared to the original functional unit
                 architecture. We demonstrate that this improvement
                 results in the routing architecture becoming the
                 bottleneck in performance.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '15 conference proceedings.",
}

@Article{delaChevallerie:2015:FLH,
  author =       "David de la Chevallerie and Jens Korinth and Andreas
                 Koch",
  title =        "{ffLink}: a Lightweight High-Performance Open-Source
                 {PCI Express Gen3} Interface for Reconfigurable
                 Accelerators",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "4",
  pages =        "34--39",
  month =        sep,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2927964.2927971",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Apr 22 17:03:53 MDT 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "We describe the architecture and implementation of
                 ffLink, a high-performance PCIe Gen3 interface for
                 attaching reconfigurable accelerators on Xilinx Virtex
                 7 FPGA devices to Linux-based hosts. ffLink encompasses
                 both hardware as well as flexible operating system
                 components that allow a tailoring of the infrastructure
                 to the specific data transfer needs of the application.
                 When configured to use multiple DMA engines to hide
                 transfer latencies, ffLink achieves a throughput of up
                 to 7 GB/s, which is 95\% of the maximum throughput of
                 an eight-lane PCIe interface, while requiring just 11\%
                 of device area on a mid-size FPGA.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '15 conference proceedings.",
}

@Article{Hmid:2015:TAR,
  author =       "Soukaina N. Hmid and Jose G. F. Coutinho and Wayne
                 Luk",
  title =        "A Transfer-Aware Runtime System for Heterogeneous
                 Asynchronous Parallel Execution",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "4",
  pages =        "40--45",
  month =        sep,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2927964.2927972",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Apr 22 17:03:53 MDT 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper presents a novel resource management
                 approach for efficiently managing the computation and
                 the data movements between the host and its
                 accelerators in a heterogeneous platform. Our approach
                 is based on OmpSs, with support for multi-core CPUs,
                 GPGPUs and Maxeler Data Flow Engines based on FPGA
                 technology; it exploits data locality, data transfer
                 costs and data dependencies. The proposed approach is
                 supported by an offline learning process coupled with
                 online monitoring, allowing performance to be estimated
                 while learning from past observations during execution.
                 Its performance is compared against the current OmpSs
                 scheduler using five benchmarks: matrix multiplication,
                 bitonic sort, N-body simulation, Cholesky decomposition
                 and AdPredictor. The results show the proposed approach
                 can achieve up to 4.25 times speed-up for Cholesky
                 decomposition. Moreover, an evaluation with AdPredictor
                 indicates that the FPGA version is up to 46 times
                 faster than the CPU version for large task sizes.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '15 conference proceedings.",
}

@Article{Al-Wattar:2015:EMA,
  author =       "Ahmed Al-Wattar and Shawki Areibi and Gary Grewal",
  title =        "Efficient Mapping and Allocation of Execution Units to
                 Task Graphs using an Evolutionary Framework",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "4",
  pages =        "46--51",
  month =        sep,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2927964.2927973",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Apr 22 17:03:53 MDT 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Partial dynamic reconfiguration of FPGAs gives
                 designers the capability to change certain parts of the
                 hardware while other parts remain active and in use.
                 This provides several benefits including reducing
                 device count and power consumption. However, this also
                 introduces new challenges that need to be addressed by
                 designers. This paper introduces a framework for
                 efficient mapping of execution units to task graphs in
                 a runtime reconfigurable system. The framework utilizes
                 an Island Based Genetic Algorithm flow that optimizes
                 several objectives including delay and power
                 consumption. The GA based technique not only optimizes
                 the above objectives, but also aggregates the Pareto
                 front of the different islands to further enhance
                 solution quality. The Island based GA runs each GA in
                 parallel, and is amenable to both software and hardware
                 implementation. The proposed Island based GA framework
                 achieves on average 55.2\% improvement over a single GA
                 implementation and 80.7\% improvement over a baseline
                 random allocation and binding approach.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '15 conference proceedings.",
}

@Article{Momeni:2015:EEO,
  author =       "Amir Momeni and Hamed Tabkhi and Yash Ukidave and
                 Gunar Schirner and David Kaeli",
  title =        "Exploring the Efficiency of the {OpenCL} Pipe Semantic
                 on an {FPGA}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "4",
  pages =        "52--57",
  month =        sep,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2927964.2927974",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Apr 22 17:03:53 MDT 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper evaluates the potential benefits of
                 leveraging the OpenCL Pipe semantic to accelerate
                 FPGA-based applications. Our work focuses on streaming
                 applications in the embedded vision processing domain.
                 These applications are well-suited for concurrent
                 kernel execution support and inter-kernel communication
                 enabled by using OpenCL pipes. We analyze the impact of
                 multiple design factors and application optimizations
                 to improve the performance offered by OpenCL Pipes. The
                 design tradeoffs considered include: the execution
                 granularity across kernels, the rate and volume of data
                 transfers, and the Pipe size. For our case study
                 application of vision flow, we observe a 2.8X increase in
                 throughput for tuned pipelined kernels, as compared to
                 non-pipelined execution. In addition, we propose a
                 novel mechanism to efficiently capture the behavior for
                 2-dimensional (2D) vision algorithms to benefit
                 Pipe-based execution.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '15 conference proceedings.",
}

@Article{Mitsuishi:2015:BFS,
  author =       "Takuji Mitsuishi and Jun Suzuki and Yuki Hayashi and
                 Masaki Kan and Hideharu Amano",
  title =        "Breadth First Search on Cost-efficient Multi-{GPU}
                 Systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "4",
  pages =        "58--63",
  month =        sep,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2927964.2927975",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Apr 22 17:03:53 MDT 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "A parallel Breadth First Search (BFS) algorithm is
                 proposed for cost-efficient multi-GPU systems without
                 enough memory amount or communication performance. By
                 using an improved data structure for the duplication
                 elimination of local nodes, both required memory amount
                 and processing time are reduced. By using Unified
                 Virtual Addressing, time for communication can be
                 hidden with the computation. The proposed algorithm is
                 implemented on two cost-efficient multi-GPU systems:
                 Express multi-GPU system which has a full of
                 flexibility but the communication latency between GPU
                 and host is limited, and a high-end gaming machine
                 whose memory is limited. Both systems achieve good
                 strong scaling with the proposed methods. On Express
                 multi-GPU system, the communication overhead was almost
                 completely hidden, and the aggregate communication
                 throughput reached 4.77 GB/sec (38.16 Gbps), almost
                 theoretical maximum.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '15 conference proceedings.",
}

@Article{Mefenza:2015:IBM,
  author =       "Michael Mefenza and Nicolas Edwards and Christophe
                 Bobda",
  title =        "Interface Based Memory Synthesis of Image Processing
                 Applications in {FPGA}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "4",
  pages =        "64--69",
  month =        sep,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2927964.2927976",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Apr 22 17:03:53 MDT 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Image processing applications are computationally
                 intensive and data intensive and rely on memory
                 elements (buffer, window, line buffer, shift register,
                 and frame buffer) to store data flow dependencies
                 between computing components in FPGA. Due to the
                 limited availability of these resources, optimization
                 of memory allocation and the implementation of
                 efficient memory architectures are important issues. We
                 present an interface, the Component Interconnect and
                 Data Access (CIDA), and its implementation, based on
                 interface automata formalism. We used that interface
                 for modeling image processing applications and
                 generating common memory elements. Based on the
                 proposed model and information about the FPGA
                 architecture, we also present an optimization model to
                 achieve allocation memory requirements to embedded
                 memories (Block RAM and Distributed RAM). Allocation
                 results from realistic video systems on Xilinx Zynq
                 FPGAs verify the correctness of the model and show that
                 the proposed approach achieves appreciable reduction in
                 block RAM usage.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '15 conference proceedings.",
}

@Article{Tong:2015:HTS,
  author =       "Da Tong and Viktor Prasanna",
  title =        "High Throughput Sketch Based Online Heavy Hitter
                 Detection on {FPGA}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "4",
  pages =        "70--75",
  month =        sep,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2927964.2927977",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Apr 22 17:03:53 MDT 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In the context of networking, a heavy hitter is an
                 entity in a data stream whose amount of activity (such
                 as bandwidth consumption or number of connections) is
                 higher than a given threshold. Detecting heavy hitters
                 is a critical task for network management and security
                 in the Internet and data centers. Data streams in
                 modern network usually contain millions of entities,
                 such as traffic flows or IP domains. It is challenging
                 to detect heavy hitters at a high throughput while
                 supporting such a large number of entities. In this
                 work, we propose a high throughput online heavy hitter
                 detector based on the Count-min sketch algorithm on
                 FPGA. We propose a high throughput hash computation
                 architecture, optimize the Count-min sketch for
                 hardware-based heavy hitter detection and use
                 forwarding to deal with data hazards. The post
                 place-and-route results of our architecture on a
                 state-of-the-art FPGA shows high throughput and
                 scalability. Our architecture achieves a throughput of
                 114 Gbps while supporting a typical 1 M concurrent
                 entities. It sustains 100+ Gbps throughput while
                 supporting various number of concurrent entities,
                 stream sizes and accuracy requirements. Our
                 implementation demonstrates improved performance
                 compared with other sketch acceleration techniques on
                 various platforms using similar sketch
                 configurations.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '15 conference proceedings.",
}

@Article{Wang:2015:CAS,
  author =       "Xinying Wang and Phillip H. Jones and Joseph
                 Zambreno",
  title =        "A Configurable Architecture for Sparse {$ L U $}
                 Decomposition on Matrices with Arbitrary Patterns",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "4",
  pages =        "76--81",
  month =        sep,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2927964.2927978",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Apr 22 17:03:53 MDT 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Sparse LU decomposition has been widely used to solve
                 sparse linear systems of equations found in many
                 scientific and engineering applications, such as
                 circuit simulation, power system modeling and computer
                 vision. However, it is considered a computationally
                 expensive factorization tool. While parallel
                 implementations have been explored to accelerate sparse
                 LU decomposition, irregular sparsity patterns often
                 limit their performance gains. Prior FPGA-based
                 accelerators have been customized to domain-specific
                 sparsity patterns of pre-ordered symmetric matrices. In
                 this paper, we present an efficient architecture for
                 sparse LU decomposition that supports both symmetric
                 and asymmetric sparse matrices with arbitrary sparsity
                 patterns. The control structure of our architecture
                 parallelizes computation and pivoting operations. Also,
                 on-chip resource utilization is configured based on
                 properties of the matrices being processed. Our
                 experimental results show a 1.6 to 14x speedup over an
                 optimized software implementation for benchmarks
                 containing a wide range of sparsity patterns.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '15 conference proceedings.",
}

@Article{Sano:2015:SCS,
  author =       "Kentaro Sano and Fumiya Kono and Naohito Nakasato and
                 Alexander Vazhenin and Stanislav Sedukhin",
  title =        "Stream Computation of Shallow Water Equation Solver
                 for {FPGA}-based {$1$D} Tsunami Simulation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "4",
  pages =        "82--87",
  month =        sep,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2927964.2927979",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Apr 22 17:03:53 MDT 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "MOST (Method Of Splitting Tsunami) is widely used to
                 solve shallow water equations (SWEs) for forecasting
                 tsunami generated by an earthquake. Toward development
                 of a power-efficient and high-performance computing
                 system for 2D tsunami simulation, we conduct
                 feasibility study on stream computation of 1D SWE
                 solver with FPGA. We analyze an original code and
                 design a stream algorithm with techniques of kernel
                 fusion, shift buffering for streamed stencil-data
                 access, and cascading processing elements for a longer
                 pipeline. We implement a deep pipeline with at most 744
                 stages of 4 SPEs on 28 nm Stratix V FPGA, which
                 achieves 82.4 GFlop/s at 200 MHz.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '15 conference proceedings.",
}

@Article{Guo:2015:PGA,
  author =       "Liucheng Guo and Andreea Ingrid Funie and David B.
                 Thomas and Haohuan Fu and Wayne Luk",
  title =        "Parallel Genetic Algorithms on Multiple {FPGAs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "43",
  number =       "4",
  pages =        "88--93",
  month =        sep,
  year =         "2015",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2927964.2927980",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Apr 22 17:03:53 MDT 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Genetic algorithms (GA) have been shown to be
                 effective in the optimization of many large-scale
                 real-world problems in a reasonable amount of time.
                 Parallel GAs not only reduce the overall GA execution
                 time, but also bring higher quality solutions due to
                 parallel search in multiple parts of the solution
                 space. This paper proposes a parallel GA system on
                 hardware such as Field-Programmable-Gate-Arrays
                 (FPGAs). Our approach targets multiple FPGAs by
                 exploring different search areas of the same solution
                 space with different behaviours. Each FPGA contains an
                 optimised customisable GA which can be configured using
                 run-time parameters, removing the need for expensive
                 recompilation. This paper also explores adjustment of
                 the migration gap, providing empirical guidance on good
                 settings to users. Experiments on three problems show
                 the high performance of our system, with a 30 times
                 speedup achieved compared to a multi-core CPU-based
                 implementation.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '15 conference proceedings.",
}

@article{Thorson:2015:INb,
  author          = {Mark Thorson},
  title           = {{Internet} Nuggets},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {43},
  number          = {4},
  pages           = {94--100},
  month           = sep,
  year            = {2015},
  coden           = {CANED2},
  doi             = {https://doi.org/10.1145/2927964.2927982},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  bibdate         = {Fri Apr 22 17:03:53 MDT 2016},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  remark          = {HEART '15 conference proceedings.},
}

@article{Thorson:2015:INc,
  author          = {Mark Thorson},
  title           = {{Internet} Nuggets},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {43},
  number          = {5},
  pages           = {7--11},
  month           = dec,
  year            = {2015},
  coden           = {CANED2},
  doi             = {https://doi.org/10.1145/2964792.2964794},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  bibdate         = {Tue Jul 12 16:17:49 MDT 2016},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
}

@article{Asgharimoghaddam:2016:SPE,
  author          = {Hadi Asgharimoghaddam and Nam Sung Kim},
  title           = {{SpinWise}: a Practical Energy-Efficient
                     Synchronization Technique for {CMPs}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {44},
  number          = {1},
  pages           = {1--8},
  month           = may,
  year            = {2016},
  coden           = {CANED2},
  doi             = {https://doi.org/10.1145/2971331.2971333},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  bibdate         = {Tue Jul 12 16:17:49 MDT 2016},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Spinning had been the classical way of implementing
                     synchronization primitives (i.e., barriers, locks and
                     conditions) in pthread library before the adoption of
                     fast user space mutex (futex). Since spinning cores do
                     not perform any useful work, it has been believed that
                     futex is more energy efficient than spinning. In this
                     paper, using commercial chip multi-processors (CMPs),
                     first we provide deep insights on how the commercial
                     CMP and operating system together reduce power
                     consumption during spinning- and futex-based
                     synchronization and analyze the duration of
                     synchronization cycles for each implementation. Second,
                     we analyze limitations of existing techniques that
                     attempt to reduce power consumption of CMPs during
                     synchronization. Finally, we propose a spinning-based
                     energy-efficient synchronization technique dubbed
                     SpinWise. We demonstrate that SpinWise can provide 22\%
                     higher geometric mean energy efficiency than futex for
                     a CMP running applications with many frequent and short
                     synchronization events.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
}

@Article{Olson:2016:PDW,
  author =       "Lena E. Olson and Mark D. Hill",
  title =        "Probabilistic Directed Writebacks for Exclusive
                 Caches",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "1",
  pages =        "9--18",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2971331.2971334",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Jul 12 16:17:49 MDT 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Energy is an increasingly important consideration in
                 memory system design. Caches improve energy efficiency
                 by decreasing execution time and reducing the number of
                 main memory accesses, but they suffer from known
                 inefficiencies: the last-level cache (LLC) tends to
                 have a high miss ratio while simultaneously storing
                 many blocks that are never referenced. Because these
                 blocks are not referenced before eviction, we can write
                 them directly to memory rather than to the LLC. To do
                 so, we must predict which blocks will not be
                 referenced. Previous approaches rely on additional
                 state at the LLC and/or extra communication. We show
                 that by predicting working set size per program counter
                 (PC), we can decide which blocks have low probability
                 of being referenced. Our approach relies on the insight
                 that it is possible to make this prediction based
                 solely on the address stream as seen by the level-one
                 data cache (L1D), eliminating the need to store or
                 communicate PC values between levels of the cache
                 hierarchy. We require no modifications to the LLC. Our
                 approach uses Flajolet and Martin's probabilistic
                 counting to keep the state small: two additional bits
                 per L1D block, with an additional 6KB prediction table.
                 This approach yields a large reduction in number of LLC
                 writebacks: 25\% fewer for SPEC on average, 80\% fewer
                 for graph500, and 67\% fewer for an in-memory hash
                 table.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@article{Thorson:2016:INa,
  author          = {Mark Thorson},
  title           = {{Internet} Nuggets},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {44},
  number          = {1},
  pages           = {19--22},
  month           = may,
  year            = {2016},
  coden           = {CANED2},
  doi             = {https://doi.org/10.1145/2971331.2971336},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  bibdate         = {Tue Jul 12 16:17:49 MDT 2016},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
}

@Article{Zhou:2016:PUH,
  author =       "Yuanyuan Zhou",
  title =        "Programming Uncertain {$<$T$>$hings}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "1--2",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872416",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Innovation flourishes with good abstractions. For
                 instance, codification of the IEEE Floating Point
                 standard in 1985 was critical to the subsequent success
                 of scientific computing. Programming languages
                 currently lack appropriate abstractions for uncertain
                 data. Applications already use estimates from sensors,
                 machine learning, big data, humans, and approximate
                 algorithms, but most programming languages do not help
                 developers address correctness, programmability, and
                 optimization problems due to estimates. To address
                 these problems, we propose a new programming
                 abstraction called Uncertain$<$T$>$. We encourage the
                 community to develop and use abstractions for
                 estimates.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@article{Abadal:2016:WAF,
  author          = {Sergi Abadal and Albert Cabellos-Aparicio and Eduard
                     Alarcon and Josep Torrellas},
  title           = {{WiSync}: an Architecture for Fast Synchronization
                     through On-Chip Wireless Communication},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {44},
  number          = {2},
  pages           = {3--17},
  month           = may,
  year            = {2016},
  coden           = {CANED2},
  doi             = {https://doi.org/10.1145/2980024.2872396},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  bibdate         = {Thu Jan 12 18:43:42 MST 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {In shared-memory multiprocessing, fine-grain
                     synchronization is challenging because it requires
                     frequent communication. As technology scaling delivers
                     larger manycore chips, such pattern is expected to
                     remain costly to support. In this paper, we propose to
                     address this challenge by using on-chip wireless
                     communication. Each core has a transceiver and an
                     antenna to communicate with all the other cores. This
                     environment supports very low latency global
                     communication. Our architecture, called WiSync, uses a
                     per-core Broadcast Memory (BM). When a core writes to
                     its BM, all the other 100+ BMs get updated in less than
                     10 processor cycles. We also use a second wireless
                     channel with cheaper transfers to execute barriers
                     efficiently. WiSync supports multiprogramming, virtual
                     memory, and context switching. Our evaluation with
                     simulations of 128-threaded kernels and 64-threaded
                     applications shows that WiSync speeds-up
                     synchronization substantially. Compared to using
                     advanced conventional synchronization, WiSync attains
                     an average speedup of nearly one order of magnitude for
                     the kernels, and 1.12 for PARSEC and SPLASH-2.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  remark          = {ASPLOS'16 conference proceedings.},
}

@article{Wang:2016:RTE,
  author          = {Xiaodong Wang and Jos{\'e} F. Mart{\'\i}nez},
  title           = {{ReBudget}: Trading Off Efficiency vs. Fairness in
                     Market-Based Multicore Resource Allocation via Runtime
                     Budget Reassignment},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {44},
  number          = {2},
  pages           = {19--32},
  month           = may,
  year            = {2016},
  coden           = {CANED2},
  doi             = {https://doi.org/10.1145/2980024.2872382},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  bibdate         = {Thu Jan 12 18:43:42 MST 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Efficiently allocating shared resources in computer
                     systems is critical to optimizing execution. Recently,
                     a number of market-based solutions have been proposed
                     to attack this problem. Some of them provide provable
                     theoretical bounds to efficiency and/or fairness losses
                     under market equilibrium. However, they are limited to
                     markets with potentially important constraints, such as
                     enforcing equal budget for all players, or
                     curve-fitting players' utility into a specific function
                     type. Moreover, they do not generally provide an
                     intuitive ``knob'' to control efficiency vs. fairness.
                     In this paper, we introduce two new metrics, Market
                     Utility Range (MUR) and Market Budget Range (MBR),
                     through which we provide for the first time theoretical
                     bounds on efficiency and fairness of market equilibria
                     under arbitrary budget assignments. We leverage this
                     result and propose ReBudget, an iterative budget
                     re-assignment algorithm that can be used to control
                     efficiency vs. fairness at run-time. We apply our
                     algorithm to a multi-resource allocation problem in
                     multicore chips. Our evaluation using detailed
                     execution-driven simulations shows that our budget
                     re-assignment technique is intuitive, effective, and
                     efficient.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  remark          = {ASPLOS'16 conference proceedings.},
}

@article{Zhu:2016:DEQ,
  author          = {Haishan Zhu and Mattan Erez},
  title           = {{Dirigent}: Enforcing {QoS} for Latency-Critical Tasks
                     on Shared Multicore Systems},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {44},
  number          = {2},
  pages           = {33--47},
  month           = may,
  year            = {2016},
  coden           = {CANED2},
  doi             = {https://doi.org/10.1145/2980024.2872394},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  bibdate         = {Thu Jan 12 18:43:42 MST 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Latency-critical applications suffer from both average
                     performance degradation and reduced completion time
                     predictability when collocated with batch tasks. Such
                     variation forces the system to overprovision resources
                     to ensure Quality of Service (QoS) for latency-critical
                     tasks, degrading overall system throughput. We explore
                     the causes of this variation and exploit the
                     opportunities of mitigating variation directly to
                     simultaneously improve both QoS and utilization. We
                     develop, implement, and evaluate Dirigent, a
                     lightweight performance-management runtime system that
                     accurately controls the QoS of latency-critical
                     applications at fine time scales, leveraging existing
                     architecture mechanisms. We evaluate Dirigent on a real
                     machine and show that it is significantly more
                     effective than configurations representative of prior
                     schemes.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  remark          = {ASPLOS'16 conference proceedings.},
}

@article{Kuperman:2016:PR,
  author          = {Yossi Kuperman and Eyal Moscovici and Joel Nider and
                     Razya Ladelsky and Abel Gordon and Dan Tsafrir},
  title           = {Paravirtual Remote {I/O}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {44},
  number          = {2},
  pages           = {49--65},
  month           = may,
  year            = {2016},
  coden           = {CANED2},
  doi             = {https://doi.org/10.1145/2980024.2872378},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  bibdate         = {Thu Jan 12 18:43:42 MST 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {The traditional ``trap and emulate'' I/O
                     paravirtualization model conveniently allows for I/O
                     interposition, yet it inherently incurs costly
                     guest-host context switches. The newer ``sidecore''
                     model eliminates this overhead by dedicating host
                     (side)cores to poll the relevant guest memory regions
                     and react accordingly without context switching. But
                     the dedication of sidecores on each host might be
                     wasteful when I/O activity is low, or it might not
                     provide enough computational power when I/O activity is
                     high. We propose to alleviate this problem at rack
                     scale by consolidating the dedicated sidecores spread
                     across several hosts onto one server. The hypervisor is
                     then effectively split into two parts: the local
                     hypervisor that hosts the VMs, and the remote
                     hypervisor that processes their paravirtual I/O. We
                     call this model vRIO---paraVirtual Remote I/O. We find
                     that by increasing the latency somewhat, it provides
                     comparable throughput with fewer sidecores and superior
                     throughput with the same number of sidecores as
                     compared to the state of the art. vRIO additionally
                     constitutes a new, cost-effective way to consolidate
                     I/O devices (on the remote hypervisor) while supporting
                     efficient programmable I/O interposition.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  remark          = {ASPLOS'16 conference proceedings.},
}

@Article{Kaufmann:2016:HPP,
  author =       "Antoine Kaufmann and Simon Peter and Naveen Kr. Sharma
                 and Thomas Anderson and Arvind Krishnamurthy",
  title =        "High Performance Packet Processing with {FlexNIC}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "67--81",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872367",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The recent surge of network I/O performance has put
                 enormous pressure on memory and software I/O processing
                 sub systems. We argue that the primary reason for high
                 memory and processing overheads is the inefficient use
                 of these resources by current commodity network
                 interface cards (NICs). We propose FlexNIC, a flexible
                 network DMA interface that can be used by operating
                 systems and applications alike to reduce packet
                 processing overheads. FlexNIC allows services to
                 install packet processing rules into the NIC, which
                 then executes simple operations on packets while
                 exchanging them with host memory. Thus, our proposal
                 moves some of the packet processing traditionally done
                 in software to the NIC, where it can be done flexibly
                 and at high speed. We quantify the potential benefits
                 of FlexNIC by emulating the proposed FlexNIC
                 functionality with existing hardware or in software. We
                 show that significant gains in application performance
                 are possible, in terms of both latency and throughput,
                 for several widely used applications, including a
                 key-value store, a stream processing system, and an
                 intrusion detection system.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@article{Bornholt:2016:SCF,
  author          = {James Bornholt and Antoine Kaufmann and Jialin Li and
                     Arvind Krishnamurthy and Emina Torlak and Xi Wang},
  title           = {Specifying and Checking File System Crash-Consistency
                     Models},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {44},
  number          = {2},
  pages           = {83--98},
  month           = may,
  year            = {2016},
  coden           = {CANED2},
  doi             = {https://doi.org/10.1145/2980024.2872406},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  bibdate         = {Thu Jan 12 18:43:42 MST 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Applications depend on persistent storage to recover
                     state after system crashes. But the POSIX file system
                     interfaces do not define the possible outcomes of a
                     crash. As a result, it is difficult for application
                     writers to correctly understand the ordering of and
                     dependencies between file system operations, which can
                     lead to corrupt application state and, in the worst
                     case, catastrophic data loss. This paper presents
                     crash-consistency models, analogous to memory
                     consistency models, which describe the behavior of a
                     file system across crashes. Crash-consistency models
                     include both litmus tests, which demonstrate allowed
                     and forbidden behaviors, and axiomatic and operational
                     specifications. We present a formal framework for
                     developing crash-consistency models, and a toolkit,
                     called Ferrite, for validating those models against
                     real file system implementations. We develop a
                     crash-consistency model for ext4, and use Ferrite to
                     demonstrate unintuitive crash behaviors of the ext4
                     implementation. To demonstrate the utility of
                     crash-consistency models to application writers, we use
                     our models to prototype proof-of-concept verification
                     and synthesis tools, as well as new library interfaces
                     for crash-safe applications.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  remark          = {ASPLOS'16 conference proceedings.},
}

@Article{Prasad:2016:PMR,
  author =       "Aravinda Prasad and K. Gopinath",
  title =        "Prudent Memory Reclamation in Procrastination-Based
                 Synchronization",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "99--112",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872405",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Procrastination is the fundamental technique used in
                 synchronization mechanisms such as Read-Copy-Update
                 (RCU) where writers, in order to synchronize with
                 readers, defer the freeing of an object until there are
                 no readers referring to the object. The synchronization
                 mechanism determines when the deferred object is safe
                 to reclaim and when it is actually reclaimed. Hence,
                 such memory reclamations are completely oblivious of
                 the memory allocator state. This induces poor memory
                 allocator performance, for instance, when the
                 reclamations are ill-timed. Furthermore, deferred
                 objects provide hints about the future that inform
                 memory regions that are about to be freed. Although
                 useful, hints are not exploited as deferred objects are
                 not visible to memory allocators. We introduce
                 Prudence, a dynamic memory allocator, that is tightly
                 integrated with the synchronization mechanism to ensure
                 visibility of deferred objects to the memory allocator.
                 Such an integration enables Prudence to (i) identify
                 the safe time to reclaim deferred objects' memory, (ii)
                 have an inclusive view of the allocated, free and
                 about-to-be-freed objects, and (iii) exploit
                 optimizations based on the hints about the future
                 during important state transitions. Our evaluation in
                 the Linux kernel shows that Prudence integrated with
                 RCU performs 3.9X to 28X better in micro-benchmarks
                 compared to SLUB, a recent memory allocator in the
                 Linux kernel. It also improves the overall performance
                 perceptibly (4\%--18\%) for a mix of widely used
                 synthetic and application benchmarks. Further, it
                 performs better (up to 98\%) in terms of object hits in
                 caches, object cache churns, slab churns, peak memory
                 usage and total fragmentation, when compared with the
                 SLUB allocator.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@Article{Mukkara:2016:WID,
  author          = {Anurag Mukkara and Nathan Beckmann and Daniel
                     Sanchez},
  title           = {{Whirlpool}: Improving Dynamic Cache Management with
                     Static Data Classification},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {44},
  number          = {2},
  pages           = {113--127},
  month           = may,
  year            = {2016},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/2980024.2872363},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Thu Jan 12 18:43:42 MST 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Cache hierarchies are increasingly non-uniform and
                     difficult to manage. Several techniques, such as
                     scratchpads or reuse hints, use static information
                     about how programs access data to manage the memory
                     hierarchy. Static techniques are effective on regular
                     programs, but because they set fixed policies, they are
                     vulnerable to changes in program behavior or available
                     cache space. Instead, most systems rely on dynamic
                     caching policies that adapt to observed program
                     behavior. Unfortunately, dynamic policies spend
                     significant resources trying to learn how programs use
                     memory, and yet they often perform worse than a static
                     policy. We present Whirlpool, a novel approach that
                     combines static information with dynamic policies to
                     reap the benefits of each. Whirlpool statically
                     classifies data into pools based on how the program
                     uses memory. Whirlpool then uses dynamic policies to
                     tune the cache to each pool. Hence, rather than setting
                     policies statically, Whirlpool uses static analysis to
                     guide dynamic policies. We present both an API that
                     lets programmers specify pools manually and a profiling
                     tool that discovers pools automatically in unmodified
                     binaries. We evaluate Whirlpool on a state-of-the-art
                     NUCA cache. Whirlpool significantly outperforms prior
                     approaches: on sequential programs, Whirlpool improves
                     performance by up to 38\% and reduces data movement
                     energy by up to 53\%; on parallel programs, Whirlpool
                     improves performance by up to 67\% and reduces data
                     movement energy by up to 2.6x.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  remark          = {ASPLOS'16 conference proceedings.},
}

@Article{Jeon:2016:TTD,
  author          = {Myeongjae Jeon and Yuxiong He and Hwanju Kim and Sameh
                     Elnikety and Scott Rixner and Alan L. Cox},
  title           = {{TPC}: Target-Driven Parallelism Combining Prediction
                     and Correction to Reduce Tail Latency in Interactive
                     Services},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {44},
  number          = {2},
  pages           = {129--141},
  month           = may,
  year            = {2016},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/2980024.2872370},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Thu Jan 12 18:43:42 MST 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {In interactive services such as web search,
                     recommendations, games and finance, reducing the tail
                     latency is crucial to provide fast response to every
                     user. Using web search as a driving example, we
                     systematically characterize interactive workload to
                     identify the opportunities and challenges for reducing
                     tail latency. We find that the workload consists of
                     mainly short requests that do not benefit from
                     parallelism, and a few long requests which
                     significantly impact the tail but exhibit high
                     parallelism speedup. This motivates estimating request
                     execution time, using a predictor, to identify long
                     requests and to parallelize them. Prediction, however,
                     is not perfect; a long request mispredicted as short is
                     likely to contribute to the server tail latency,
                     setting a ceiling on the achievable tail latency. We
                     propose TPC, an approach that combines prediction
                     information judiciously with dynamic correction for
                     inaccurate prediction. Dynamic correction increases
                     parallelism to accelerate a long request that is
                     mispredicted as short. TPC carefully selects the
                     appropriate target latencies based on system load and
                     parallelism efficiency to reduce tail latency. We
                     implement TPC and several prior approaches to compare
                     them experimentally on a single search server and on a
                     cluster of 40 search servers. The experimental results
                     show that TPC reduces the 99th- and 99.9th-percentile
                     latency by up to 40\% compared with the best prior
                     work. Moreover, we evaluate TPC on a finance server,
                     demonstrating its effectiveness on reducing tail
                     latency of interactive services beyond web search.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  remark          = {ASPLOS'16 conference proceedings.},
}

@Article{Brown:2016:HBS,
  author =       "Fraser Brown and Andres N{\"o}tzli and Dawson Engler",
  title =        "How to Build Static Checking Systems Using Orders of
                 Magnitude Less Code",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "143--157",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872364",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Modern static bug finding tools are complex. They
                 typically consist of hundreds of thousands of lines of
                 code, and most of them are wedded to one language (or
                 even one compiler). This complexity makes the systems
                 hard to understand, hard to debug, and hard to retarget
                 to new languages, thereby dramatically limiting their
                 scope. This paper reduces checking system complexity by
                 addressing a fundamental assumption, the assumption
                 that checkers must depend on a full-blown language
                 specification and compiler front end. Instead, our
                 program checkers are based on drastically incomplete
                 language grammars (``micro-grammars'') that describe
                 only portions of a language relevant to a checker. As a
                 result, our implementation is tiny---roughly 2500 lines
                 of code, about two orders of magnitude smaller than a
                 typical system. We hope that this dramatic increase in
                 simplicity will allow people to use more checkers on
                 more systems in more languages. We implement our
                 approach in $ \mu $chex, a language-agnostic framework
                 for writing static bug checkers. We use it to build
                 micro-grammar based checkers for six languages (C, the
                 C preprocessor, C++, Java, JavaScript, and Dart) and
                 find over 700 errors in real-world projects.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@Article{Zhang:2016:TED,
  author          = {Tong Zhang and Dongyoon Lee and Changhee Jung},
  title           = {{TxRace}: Efficient Data Race Detection Using
                     Commodity Hardware Transactional Memory},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {44},
  number          = {2},
  pages           = {159--173},
  month           = may,
  year            = {2016},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/2980024.2872384},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Thu Jan 12 18:43:42 MST 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Detecting data races is important for debugging
                     shared-memory multithreaded programs, but the high
                     runtime overhead prevents the wide use of dynamic data
                     race detectors. This paper presents TxRace, a new
                     software data race detector that leverages commodity
                     hardware transactional memory (HTM) to speed up data
                     race detection. TxRace instruments a multithreaded
                     program to transform synchronization-free regions into
                     transactions, and exploits the conflict detection
                     mechanism of HTM for lightweight data race detection at
                     runtime. However, the limitations of the current
                     best-effort commodity HTMs expose several challenges in
                     using them for data race detection: (1) lack of ability
                     to pinpoint racy instructions, (2) false positives
                     caused by cache line granularity of conflict detection,
                     and (3) transactional aborts for non-conflict reasons
                     (e.g., capacity or unknown). To overcome these
                     challenges, TxRace performs lightweight HTM-based data
                     race detection at first, and occasionally switches to
                     slow yet precise data race detection only for the small
                     fraction of execution intervals in which potential
                     races are reported by HTM. According to the
                     experimental results, TxRace reduces the average
                     runtime overhead of dynamic data race detection from
                     11.68x to 4.65x with only a small number of false
                     negatives.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  remark          = {ASPLOS'16 conference proceedings.},
}

@Article{Amani:2016:CVH,
  author          = {Sidney Amani and Alex Hixon and Zilin Chen and
                     Christine Rizkallah and Peter Chubb and Liam O'Connor
                     and Joel Beeren and Yutaka Nagashima and Japheth Lim
                     and Thomas Sewell and Joseph Tuong and Gabriele Keller
                     and Toby Murray and Gerwin Klein and Gernot Heiser},
  title           = {{Cogent}: Verifying High-Assurance File System
                     Implementations},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {44},
  number          = {2},
  pages           = {175--188},
  month           = may,
  year            = {2016},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/2980024.2872404},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Thu Jan 12 18:43:42 MST 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {We present an approach to writing and formally
                     verifying high-assurance file-system code in a
                     restricted language called Cogent, supported by a
                     certifying compiler that produces C code, high-level
                     specification of Cogent, and translation correctness
                     proofs. The language is strongly typed and guarantees
                     absence of a number of common file system
                     implementation errors. We show how verification effort
                     is drastically reduced for proving higher-level
                     properties of the file system implementation by
                     reasoning about the generated formal specification
                     rather than its low-level C code. We use the framework
                     to write two Linux file systems, and compare their
                     performance with their native C implementations.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  remark          = {ASPLOS'16 conference proceedings.},
}

@Article{Asmussen:2016:MHO,
  author =       "Nils Asmussen and Marcus V{\"o}lp and Benedikt
                 N{\"o}then and Hermann H{\"a}rtig and Gerhard
                 Fettweis",
  title =        "{M3}: a Hardware\slash Operating-System Co-Design to
                 Tame Heterogeneous Manycores",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "189--203",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872371",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In the last decade, the number of available cores
                 increased and heterogeneity grew. In this work, we ask
                 the question whether the design of the current
                 operating systems (OSes) is still appropriate if these
                 trends continue and lead to abundantly available but
                 heterogeneous cores, or whether it forces a fundamental
                 rethinking of how systems are designed. We argue that:
                 1. hiding heterogeneity behind a common hardware
                 interface unifies, to a large extent, the control and
                 coordination of cores and accelerators in the OS, 2.
                 isolating at the network-on-chip rather than with
                 processor features (like privileged mode, memory
                 management unit, \ldots{}), allows running untrusted
                 code on arbitrary cores, and 3. providing OS services
                 via protocols over the network-on-chip, instead of via
                 system calls, makes them accessible to arbitrary types
                 of cores as well. In summary, this turns accelerators
                 into first-class citizens and enables a single and
                 convenient programming environment for all cores
                 without the need to trust any application. In this
                 paper, we introduce network-on-chip-level isolation,
                 present the design of our microkernel-based OS, M3, and
                 the common hardware interface, and evaluate the
                 performance of our prototype in comparison to Linux. A
                 bit surprising, without using accelerators, M3
                 outperforms Linux in some application-level benchmarks
                 by more than a factor of five.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@Article{Liaqat:2016:SEE,
  author =       "Daniyal Liaqat and Silviu Jingoi and Eyal de Lara and
                 Ashvin Goel and Wilson To and Kevin Lee and Italo {De
                 Moraes Garcia} and Manuel Saldana",
  title =        "{Sidewinder}: an Energy Efficient and Developer
                 Friendly Heterogeneous Architecture for Continuous
                 Mobile Sensing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "205--215",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872398",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Applications that perform continuous sensing on mobile
                 phones have the potential to revolutionize everyday
                 life. Examples range from medical and health monitoring
                 applications, such as pedometers and fall detectors, to
                 participatory sensing applications, such as noise
                 pollution, traffic and seismic activity monitoring.
                 Unfortunately, current mobile devices are a poor match
                 for continuous sensing applications as they require the
                 device to remain awake for extended periods of time,
                 resulting in poor battery life. This paper presents
                 Sidewinder, a new approach towards offloading sensor
                 data processing to a low-power processor and waking up
                 the main processor when events of interest occur. This
                 approach differs from other heterogeneous architectures
                 in that developers are presented with a programming
                 interface that lets them construct application specific
                 wake-up conditions by linking together and
                 parameterizing predefined sensor data processing
                 algorithms. Our experiments indicate performance that
                 is comparable to approaches that provide fully
                 programmable offloading, but do so with a much simpler
                 programming interface that facilitates deployment and
                 portability.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@Article{Balkind:2016:OOS,
  author          = {Jonathan Balkind and Michael McKeown and Yaosheng Fu
                     and Tri Nguyen and Yanqi Zhou and Alexey Lavrov and
                     Mohammad Shahrad and Adi Fuchs and Samuel Payne and
                     Xiaohua Liang and Matthew Matl and David Wentzlaff},
  title           = {{OpenPiton}: an Open Source Manycore Research
                     Framework},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {44},
  number          = {2},
  pages           = {217--232},
  month           = may,
  year            = {2016},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/2980024.2872414},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Thu Jan 12 18:43:42 MST 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Industry is building larger, more complex, manycore
                     processors on the back of strong institutional
                     knowledge, but academic projects face difficulties in
                     replicating that scale. To alleviate these difficulties
                     and to develop and share knowledge, the community needs
                     open architecture frameworks for simulation, synthesis,
                     and software exploration which support extensibility,
                     scalability, and configurability, alongside an
                     established base of verification tools and supported
                     software. In this paper we present OpenPiton, an open
                     source framework for building scalable architecture
                     research prototypes from 1 core to 500 million cores.
                     OpenPiton is the world's first open source,
                     general-purpose, multithreaded manycore processor and
                     framework. OpenPiton leverages the industry hardened
                     OpenSPARC T1 core with modifications and builds upon it
                     with a scratch-built, scalable uncore creating a
                     flexible, modern manycore design. In addition,
                     OpenPiton provides synthesis and backend scripts for
                     ASIC and FPGA to enable other researchers to bring
                     their designs to implementation. OpenPiton provides a
                     complete verification infrastructure of over 8000
                     tests, is supported by mature software tools, runs
                     full-stack multiuser Debian Linux, and is written in
                     industry standard Verilog. Multiple implementations of
                     OpenPiton have been created including a taped-out
                     25-core implementation in IBM's 32nm process and
                     multiple Xilinx FPGA prototypes.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  remark          = {ASPLOS'16 conference proceedings.},
}

@Article{Lustig:2016:CVM,
  author =       "Daniel Lustig and Geet Sethi and Margaret Martonosi
                 and Abhishek Bhattacharjee",
  title =        "{COATCheck}: Verifying Memory Ordering at the
                 Hardware-{OS} Interface",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "233--247",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872399",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Modern computer systems include numerous compute
                 elements, from CPUs to GPUs to accelerators. Harnessing
                 their full potential requires well-defined,
                 properly-implemented memory consistency models (MCMs),
                 and low-level system functionality such as virtual
                 memory and address translation (AT). Unfortunately, it
                 is difficult to specify and implement hardware-OS
                 interactions correctly; in the past, many hardware and
                 OS specification mismatches have resulted in
                 implementation bugs in commercial processors. In an
                 effort to resolve this verification gap, this paper
                 makes the following contributions. First, we present
                 COATCheck, an address translation-aware framework for
                 specifying and statically verifying memory ordering
                 enforcement at the microarchitecture and operating
                 system levels. We develop a domain-specific language
                 for specifying ordering enforcement, for including
                 ordering-related OS events and hardware
                 micro-operations, and for programmatically enumerating
                 happens-before graphs. Using a fast and automated
                 static constraint solver, COATCheck can efficiently
                 analyze interesting and important memory ordering
                 scenarios for modern, high-performance, out-of-order
                 processors. Second, we show that previous work on
                 Virtual Address Memory Consistency (VAMC) does not
                 capture every translation-related ordering scenario of
                 interest, and that some such cases even fall outside
                 the traditional scope of consistency. We therefore
                 introduce the term transistency model to describe the
                 superset of consistency which captures all
                 translation-aware sets of ordering rules.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@Article{Markuze:2016:TIP,
  author =       "Alex Markuze and Adam Morrison and Dan Tsafrir",
  title =        "True {IOMMU} Protection from {DMA} Attacks: When Copy
                 is Faster than Zero Copy",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "249--262",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872379",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Malicious I/O devices might compromise the OS using
                 DMAs. The OS therefore utilizes the IOMMU to map and
                 unmap every target buffer right before and after its
                 DMA is processed, thereby restricting DMAs to their
                 designated locations. This usage model, however, is not
                 truly secure for two reasons: (1) it provides
                 protection at page granularity only, whereas DMA
                 buffers can reside on the same page as other data; and
                 (2) it delays DMA buffer unmaps due to performance
                 considerations, creating a vulnerability window in
                 which devices can access in-use memory. We propose that
                 OSes utilize the IOMMU differently, in a manner that
                 eliminates these two flaws. Our new usage model
                 restricts device access to a set of shadow DMA buffers
                 that are never unmapped, and it copies DMAed data
                 to/from these buffers, thus providing sub-page
                 protection while eliminating the aforementioned
                 vulnerability window. Our key insight is that the cost
                 of interacting with, and synchronizing access to the
                 slow IOMMU hardware---required for zero-copy protection
                 against devices---make copying preferable to
                 zero-copying. We implement our model in Linux and
                 evaluate it with standard networking benchmarks
                 utilizing a 40\,Gb/s NIC. We demonstrate that despite
                 being more secure than the safest preexisting usage
                 model, our approach provides up to 5x higher
                 throughput. Additionally, whereas it is inherently less
                 scalable than an IOMMU-less (unprotected) system, our
                 approach incurs only 0\%--25\% performance degradation
                 in comparison.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

%%% Awad et al. (ASPLOS'16, per the remark field): Silent Shredder,
%%% which removes OS page-zeroing (data shredding) writes on encrypted
%%% non-volatile main memory by repurposing counter-mode IVs.
@Article{Awad:2016:SSZ,
  author =       "Amro Awad and Pratyusa Manadhata and Stuart Haber and
                 Yan Solihin and William Horne",
  title =        "{Silent Shredder}: Zero-Cost Shredding for Secure
                 Non-Volatile Main Memory Controllers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "263--276",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872377",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "As non-volatile memory (NVM) technologies are expected
                 to replace DRAM in the near future, new challenges have
                 emerged. For example, NVMs have slow and
                 power-consuming writes, and limited write endurance. In
                 addition, NVMs have a data remanence vulnerability,
                 i.e., they retain data for a long time after being
                 powered off. NVM encryption alleviates the
                 vulnerability, but exacerbates the limited endurance by
                 increasing the number of writes to memory. We observe
                 that, in current systems, a large percentage of main
                 memory writes result from data shredding in operating
                 systems, a process of zeroing out physical pages before
                 mapping them to new processes, in order to protect
                 previous processes' data. In this paper, we propose
                 Silent Shredder, which repurposes initialization
                 vectors used in standard counter mode encryption to
                 completely eliminate the data shredding writes. Silent
                 Shredder also speeds up reading shredded cache lines,
                 and hence reduces power consumption and improves
                 overall performance. To evaluate our design, we run
                 three PowerGraph applications and 26 multi-programmed
                 workloads from the SPEC 2006 suite, on a gem5-based
                 full system simulator. Silent Shredder eliminates an
                 average of 48.6\% of the writes in the initialization
                 and graph construction phases. It speeds up main memory
                 reads by 3.3 times, and improves the number of
                 instructions per cycle (IPC) by 6.4\% on average.
                 Finally, we discuss several use cases, including
                 virtual machines' data isolation and user-level large
                 data initialization, where Silent Shredder can be used
                 effectively at no extra cost.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

%%% Kwon et al. (ASPLOS'16, per the remark field): Sego, a
%%% hypervisor-based system that verifies untrusted guest-OS services
%%% by attaching trusted metadata to user data.
@Article{Kwon:2016:SPT,
  author =       "Youngjin Kwon and Alan M. Dunn and Michael Z. Lee and
                 Owen S. Hofmann and Yuanzhong Xu and Emmett Witchel",
  title =        "{Sego}: Pervasive Trusted Metadata for Efficiently
                 Verified Untrusted System Services",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "277--290",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872372",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Sego is a hypervisor-based system that gives strong
                 privacy and integrity guarantees to trusted
                 applications, even when the guest operating system is
                 compromised or hostile. Sego verifies operating system
                 services, like the file system, instead of replacing
                 them. By associating trusted metadata with user data
                 across all system devices, Sego verifies system
                 services more efficiently than previous systems,
                 especially services that depend on data contents. We
                 extensively evaluate Sego's performance on real
                 workloads and implement a kernel fault injector to
                 validate Sego's file system-agnostic crash consistency
                 and recovery protocol.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

%%% Tsafrir (ASPLOS'16, per the remark field): synopsis of the WACI
%%% session, which this year consisted only of invited speakers.
%%% NOTE(review): the bracketed numbers [1]..[10] in the abstract are
%%% presumably citations into the article's own reference list.
@Article{Tsafrir:2016:SAW,
  author =       "Dan Tsafrir",
  title =        "Synopsis of the {ASPLOS '16 Wild and Crazy Ideas
                 (WACI)} Invited-Speakers Session",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "291--294",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2876512",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The Wild and Crazy Ideas (WACI) session is a
                 longstanding tradition at ASPLOS, soliciting talks that
                 consist of forward-looking, visionary, inspiring,
                 creative, far out or just plain amazing ideas presented
                 in an exciting way. (Amusing elements in the
                 presentations are tolerated ;-) but are in fact
                 optional.) The first WACI session took place in 1998.
                 Back then, the call for talks included a problem
                 statement, which contended that ``papers usually do not
                 get admitted to [such conferences as] ISCA or ASPLOS
                 unless the systems that they describe are mature enough
                 to run [some standard benchmark suites, which] has a
                 chilling effect on the idea generation process ---
                 encouraging incremental research'' [1]. The 1998 WACI
                 session turned out to be a great success. Its webpage
                 states that ``there were 42 submissions [competing
                 over] only eight time slots, [which resulted in] this
                 session [having] a lower acceptance rate than the
                 conference itself'' [2]. But the times they are
                 a-changin' [3], and the WACI session no longer enjoys
                 that many submissions (Figure (1)), perhaps because
                 nowadays there exist many forums for researchers to
                 describe/discuss their preliminary ideas, including:
                 the ``hot topics in'' workshops [4--7]; a journal like
                 CAL, dedicated to early results [8]; main conferences
                 soliciting short submissions describing ``original or
                 unconventional ideas at a preliminary stage'' in
                 addition to regular papers [9]; and the many workshops
                 co-located with main conferences, like ISCA '15, which
                 hosted thirteen such workshops [10]. Regardless of the
                 reason for the declining number of submissions, this
                 time we've decided to organize the WACI session
                 differently to ensure its continued high quality.
                 Instead of soliciting talks via an open call and hoping
                 for the best, we proactively invited speakers whom we
                 believe are capable of delivering excellent WACI
                 presentations. That is, this year's WACI session
                 consists exclusively of invited speakers. Filling up
                 the available slots turned out to be fairly easy, as
                 most of the researchers we invited promptly accepted
                 our invitation. The duration of each talk was set to be
                 eight minutes (exactly as in the first WACI session
                 from 1998) plus two minutes for questions. The talks
                 are outlined below. We believe they are interesting and
                 exciting, and we hope the attendees of the session will
                 find them stimulating and insightful.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

%%% Williams (ASPLOS'16, per the remark field): single-page item
%%% (pages 295--295); this entry carries no abstract field.
@Article{Williams:2016:BIC,
  author =       "R. Stanley Williams",
  title =        "Brain Inspired Computing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "295--295",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872417",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

%%% Phothilimthana et al. (ASPLOS'16, per the remark field): LENS, a
%%% pruning enumerative search for superoptimization, combined with
%%% stochastic and symbolic search in a cooperative superoptimizer.
@Article{Phothilimthana:2016:SS,
  author =       "Phitchaya Mangpo Phothilimthana and Aditya Thakur and
                 Rastislav Bodik and Dinakar Dhurjati",
  title =        "Scaling up Superoptimization",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "297--310",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872387",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Developing a code optimizer is challenging, especially
                 for new, idiosyncratic ISAs. Superoptimization can, in
                 principle, discover machine-specific optimizations
                 automatically by searching the space of all instruction
                 sequences. If we can increase the size of code
                 fragments a superoptimizer can optimize, we will be
                 able to discover more optimizations. We develop LENS, a
                 search algorithm that increases the size of code a
                 superoptimizer can synthesize by rapidly pruning away
                 invalid candidate programs. Pruning is achieved by
                 selectively refining the abstraction under which
                 candidates are considered equivalent, only in the
                 promising part of the candidate space. LENS also uses a
                 bidirectional search strategy to prune the candidate
                 space from both forward and backward directions. These
                 pruning strategies allow LENS to solve twice as many
                 benchmarks as existing enumerative search algorithms,
                 while LENS is about 11-times faster. Additionally, we
                 increase the effective size of the superoptimized
                 fragments by relaxing the correctness condition using
                 contexts (surrounding code). Finally, we combine LENS
                 with complementary search techniques into a cooperative
                 superoptimizer, which exploits the stochastic search to
                 make random jumps in a large candidate space, and a
                 symbolic (SAT-solver-based) search to synthesize
                 arbitrary constants. While existing superoptimizers
                 consistently solve 9--16 out of 32 benchmarks, the
                 cooperative superoptimizer solves 29 benchmarks. It can
                 synthesize code fragments that are up to 82\% faster
                 than code generated by gcc -O3 from WiBench and
                 MiBench.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

%%% Hasabnis and Sekar (ASPLOS'16, per the remark field): a
%%% learning-based approach that lifts assembly instructions to a
%%% compiler's architecture-neutral intermediate language.
@Article{Hasabnis:2016:LAI,
  author =       "Niranjan Hasabnis and R. Sekar",
  title =        "Lifting Assembly to Intermediate Representation: a
                 Novel Approach Leveraging Compilers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "311--324",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872380",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Translating low-level machine instructions into
                 higher-level intermediate language (IL) is one of the
                 central steps in many binary analysis and
                 instrumentation systems. Existing systems build such
                 translators manually. As a result, it takes a great
                 deal of effort to support new architectures. Even for
                 widely deployed architectures, full instruction sets
                 may not be modeled, e.g., mature systems such as
                 Valgrind still lack support for AVX, FMA4 and SSE4.1
                 for x86 processors. To overcome these difficulties, we
                 propose a novel approach that leverages knowledge about
                 instruction set semantics that is already embedded into
                 modern compilers such as GCC. In particular, we present
                 a learning-based approach for automating the
                 translation of assembly instructions to a compiler's
                 architecture-neutral IL. We present an experimental
                 evaluation that demonstrates the ability of our
                 approach to easily support many architectures (x86, ARM
                 and AVR), including their advanced instruction sets.
                 Our implementation is available as open-source
                 software.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

%%% Muralidharan et al. (ASPLOS'16, per the remark field): code variant
%%% selection learned on source GPU architectures and applied to a new
%%% target architecture, posed as a multi-task learning problem.
@Article{Muralidharan:2016:AAC,
  author =       "Saurav Muralidharan and Amit Roy and Mary Hall and
                 Michael Garland and Piyush Rai",
  title =        "Architecture-Adaptive Code Variant Tuning",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "325--338",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872411",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Code variants represent alternative implementations of
                 a computation, and are common in high-performance
                 libraries and applications to facilitate selecting the
                 most appropriate implementation for a specific
                 execution context (target architecture and input
                 dataset). Automating code variant selection typically
                 relies on machine learning to construct a model during
                 an offline learning phase that can be quickly queried
                 at runtime once the execution context is known. In this
                 paper, we define a new approach called
                 architecture-adaptive code variant tuning, where the
                 variant selection model is learned on a set of source
                 architectures, and then used to predict variants on a
                 new target architecture without having to repeat the
                 training process. We pose this as a multi-task learning
                 problem, where each source architecture corresponds to
                 a task; we use device features in the construction of
                 the variant selection model. This work explores the
                 effectiveness of multi-task learning and the impact of
                 different strategies for device feature selection. We
                 evaluate our approach on a set of benchmarks and a
                 collection of six NVIDIA GPU architectures from three
                 distinct generations. We achieve performance results
                 that are mostly comparable to the previous approach of
                 tuning for a single GPU architecture without having to
                 repeat the learning phase.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

%%% Lin et al. (ASPLOS'16, per the remark field): Fastsocket, a
%%% BSD-socket-compatible kernel TCP stack design that partitions
%%% connection state per core for short-lived connections.
@Article{Lin:2016:SKT,
  author =       "Xiaofeng Lin and Yu Chen and Xiaodong Li and Junjie
                 Mao and Jiaquan He and Wei Xu and Yuanchun Shi",
  title =        "Scalable Kernel {TCP} Design and Implementation for
                 Short-Lived Connections",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "339--352",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872391",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "With the rapid growth of network bandwidth, increases
                 in CPU cores on a single machine, and application API
                 models demanding more short-lived connections, a
                 scalable TCP stack is performance-critical. Although
                 many clean-slate designs have been proposed, production
                 environments still call for a bottom-up parallel TCP
                 stack design that is backward-compatible with existing
                 applications. We present Fastsocket, a BSD
                 Socket-compatible and scalable kernel socket design,
                 which achieves table-level connection partition in TCP
                 stack and guarantees connection locality for both
                 passive and active connections. Fastsocket architecture
                 is a ground up partition design, from NIC interrupts
                 all the way up to applications, which naturally
                 eliminates various lock contentions in the entire
                 stack. Moreover, Fastsocket maintains the full
                 functionality of the kernel TCP stack and
                 BSD-socket-compatible API, and thus applications need
                 no modifications. Our evaluations show that Fastsocket
                 achieves a speedup of 20.4x on a 24-core machine under
                 a workload of short-lived connections, outperforming
                 the state-of-the-art Linux kernel TCP implementations.
                 When scaling up to 24 CPU cores, Fastsocket increases
                 the throughput of Nginx and HAProxy by 267\% and 621\%
                 respectively compared with the base Linux kernel. We
                 also demonstrate that Fastsocket can achieve
                 scalability and preserve BSD socket API at the same
                 time. Fastsocket is already deployed in the production
                 environment of Sina WeiBo, serving 50 million daily
                 active users and billions of requests per day.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

%%% El Hajj et al. (ASPLOS'16, per the remark field): SpaceJMP, an OS
%%% design letting process threads attach to, detach from, and switch
%%% between multiple virtual address spaces.
%%% The first author's surname is braced as {El Hajj} so that BibTeX
%%% parses the two words as one last name in First-Last name form.
@Article{Hajj:2016:SPM,
  author =       "Izzat {El Hajj} and Alexander Merritt and Gerd
                 Zellweger and Dejan Milojicic and Reto Achermann and
                 Paolo Faraboschi and Wen-mei Hwu and Timothy Roscoe and
                 Karsten Schwan",
  title =        "{SpaceJMP}: Programming with Multiple Virtual Address
                 Spaces",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "353--368",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872366",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Memory-centric computing demands careful organization
                 of the virtual address space, but traditional methods
                 for doing so are inflexible and inefficient. If an
                 application wishes to address larger physical memory
                 than virtual address bits allow, if it wishes to
                 maintain pointer-based data structures beyond process
                 lifetimes, or if it wishes to share large amounts of
                 memory across simultaneously executing processes,
                 legacy interfaces for managing the address space are
                 cumbersome and often incur excessive overheads. We
                 propose a new operating system design that promotes
                 virtual address spaces to first-class citizens,
                 enabling process threads to attach to, detach from, and
                 switch between multiple virtual address spaces. Our
                 work enables data-centric applications to utilize vast
                 physical memory beyond the virtual range, represent
                 persistent pointer-rich data structures without special
                 pointer representations, and share large amounts of
                 memory between processes efficiently. We describe our
                 prototype implementations in the DragonFly BSD and
                 Barrelfish operating systems. We also present
                 programming semantics and a compiler transformation to
                 detect unsafe pointer usage. We demonstrate the
                 benefits of our work on data-intensive applications
                 such as the GUPS benchmark, the SAMTools genomics
                 workflow, and the Redis key-value store.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

%%% Lin and Liu (ASPLOS'16, per the remark field): memif, a protected
%%% OS service for asynchronous, hardware-accelerated memory move.
%%% The system name in the title is set in typewriter type and braced
%%% so that bibliography styles do not change its case.
@Article{Lin:2016:MTP,
  author =       "Felix Xiaozhu Lin and Xu Liu",
  title =        "{{\tt memif}}: Towards Programming Heterogeneous
                 Memory Asynchronously",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "369--383",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872401",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "To harness a heterogeneous memory hierarchy, it is
                 advantageous to integrate application knowledge in
                 guiding frequent memory move, i.e., replicating or
                 migrating virtual memory regions. To this end, we
                 present memif, a protected OS service for asynchronous,
                 hardware-accelerated memory move. Compared to the state
                 of the art --- page migration in Linux, memif incurs
                 low overhead and low latency; in order to do so, it not
                 only redefines the semantics of kernel interface but
                 also overhauls the underlying mechanisms, including
                 request/completion management, race handling, and DMA
                 engine configuration. We implement memif in Linux for a
                 server-class system-on-chip that features heterogeneous
                 memories. Compared to the current Linux page migration,
                 memif reduces CPU usage by up to 15\% for small pages
                 and by up to 38x for large pages; in continuously
                 serving requests, memif has no need for request
                 batching and reduces latency by up to 63\%. By crafting
                 a small runtime atop memif, we improve the throughputs
                 for a set of streaming workloads by up to 33\%.
                 Overall, memif has opened the door to software
                 management of heterogeneous memory.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

%%% Kim et al. (ASPLOS'16, per the remark field): NVWAL, write-ahead
%%% logging for SQLite that keeps the log in byte-addressable NVRAM.
%%% Numeric ranges in the abstract use an en-dash (--); a bare tilde
%%% would be typeset by TeX as a non-breaking space.
@Article{Kim:2016:NEN,
  author =       "Wook-Hee Kim and Jinwoong Kim and Woongki Baek and
                 Beomseok Nam and Youjip Won",
  title =        "{NVWAL}: Exploiting {NVRAM} in Write-Ahead Logging",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "385--398",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872392",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Emerging byte-addressable non-volatile memory is
                 considered an alternative storage device for database
                 logs that require persistency and high performance. In
                 this work, we develop NVWAL (NVRAM Write-Ahead Logging)
                 for SQLite. The contribution of NVWAL consists of three
                 elements: (i) byte-granularity differential logging
                 that effectively eliminates the excessive I/O overhead
                 of filesystem-based logging or journaling, (ii)
                 transaction-aware lazy synchronization that reduces
                 cache synchronization overhead by two-thirds, and (iii)
                 user-level heap management of the NVRAM persistent WAL
                 structure, which reduces the overhead of managing
                 persistent objects. We implemented NVWAL in SQLite and
                 measured the performance on a Nexus 5 smartphone and an
                 NVRAM emulation board --- Tuna. Our performance study
                 shows the following: (i) the overhead of enforcing
                 strict ordering of NVRAM writes can be reduced via
                 NVRAM-aware transaction management. (ii) From the
                 application performance point of view, the overhead of
                 guaranteeing failure atomicity is negligible; the cache
                 line flush overhead accounts for only 0.8--4.6\% of
                 transaction execution time. Therefore, application
                 performance is much less sensitive to the NVRAM
                 performance than we expected. Decreasing the NVRAM
                 latency by one-fifth (from 1942 nsec to 437 nsec),
                 SQLite achieves a mere 4\% performance gain (from 2517
                 ins/sec to 2621 ins/sec). (iii) Overall, when the write
                 latency of NVRAM is 2 usec, NVWAL increases SQLite
                 performance by at least 10x compared to that of WAL on
                 flash memory (from 541 ins/sec to 5812 ins/sec).",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

%%% Kolli et al. (ASPLOS'16, per the remark field): transaction designs
%%% for NVRAM that minimise write-ordering constraints, notably by
%%% deferring commit until after locks are released.
@Article{Kolli:2016:HPT,
  author =       "Aasheesh Kolli and Steven Pelley and Ali Saidi and
                 Peter M. Chen and Thomas F. Wenisch",
  title =        "High-Performance Transactions for Persistent
                 Memories",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "399--411",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872381",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Emerging non-volatile memory (NVRAM) technologies
                 offer the durability of disk with the
                 byte-addressability of DRAM. These devices will allow
                 software to access persistent data structures directly
                 in NVRAM using processor loads and stores, however,
                 ensuring consistency of persistent data across power
                 failures and crashes is difficult. Atomic, durable
                 transactions are a widely used abstraction to enforce
                 such consistency. Implementing transactions on NVRAM
                 requires the ability to constrain the order of NVRAM
                 writes, for example, to ensure that a transaction's log
                 record is complete before it is marked committed. Since
                 NVRAM write latencies are expected to be high,
                 minimizing these ordering constraints is critical for
                 achieving high performance. Recent work has proposed
                 programming interfaces to express NVRAM write ordering
                 constraints to hardware so that NVRAM writes may be
                 coalesced and reordered while preserving necessary
                 constraints. Unfortunately, a straightforward
                 implementation of transactions under these interfaces
                 imposes unnecessary constraints. We show how to remove
                 these dependencies through a variety of techniques,
                 notably, deferring commit until after locks are
                 released. We present a comprehensive analysis
                 contrasting two transaction designs across three NVRAM
                 programming interfaces, demonstrating up to 2.5x
                 speedup.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@Article{Guo:2016:HDI,
  author =       "Qing Guo and Karin Strauss and Luis Ceze and Henrique
                 S. Malvar",
  title =        "High-Density Image Storage Using Approximate Memory
                 Cells",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "413--426",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872413",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper proposes tailoring image encoding for an
                 approximate storage substrate. We demonstrate that
                 indiscriminately storing encoded images in approximate
                 memory generates unacceptable and uncontrollable
                 quality degradation. The key finding is that errors in
                 the encoded bit streams have non-uniform impact on the
                 decoded image quality. We develop a methodology to
                 determine the relative importance of encoded bits and
                 store them in an approximate storage substrate. The
                 storage cells are optimized to reduce error rate via
                 biasing and are tuned to meet the desired reliability
                 requirement via selective error correction. In a case
                 study with the progressive transform codec (PTC), a
                 precursor to JPEG XR, the proposed approximate image
                 storage system exhibits a 2.7x increase in density of
                 pixels per silicon volume under bounded error rates,
                 and this achievement is additive to the storage savings
                 of PTC compression.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@Article{Izraelevitz:2016:FAP,
  author =       "Joseph Izraelevitz and Terence Kelly and Aasheesh
                 Kolli",
  title =        "Failure-Atomic Persistent Memory Updates via {JUSTDO}
                 Logging",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "427--442",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872410",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Persistent memory invites applications to manipulate
                 persistent data via load and store instructions.
                 Because failures during updates may destroy transient
                 data (e.g., in CPU registers), preserving data
                 integrity in the presence of failures requires
                 failure-atomic bundles of updates. Prior failure
                 atomicity approaches for persistent memory entail
                 overheads due to logging and CPU cache flushing.
                 Persistent caches can eliminate the need for flushing,
                 but conventional logging remains complex and memory
                 intensive. We present the design and implementation of
                 JUSTDO logging, a new failure atomicity mechanism that
                 greatly reduces the memory footprint of logs,
                 simplifies log management, and enables fast parallel
                 recovery following failure. Crash-injection tests
                 confirm that JUSTDO logging preserves application data
                 integrity and performance evaluations show that it
                 improves throughput 3x or more compared with a
                 state-of-the-art alternative for a spectrum of
                 data-intensive algorithms.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@Article{Han:2016:IMD,
  author =       "Jaeung Han and Seungheun Jeon and Young-ri Choi and
                 Jaehyuk Huh",
  title =        "Interference Management for Distributed Parallel
                 Applications in Consolidated Clusters",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "443--456",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872388",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Consolidating multiple applications on a system can
                 improve the overall resource utilization of data center
                 systems. However, such consolidation can adversely
                 affect the performance of some applications due to
                 interference caused by resource contention. Despite
                 many prior studies on the interference effects in
                 single-node systems, the interference behaviors of
                 distributed parallel applications have not been
                 investigated thoroughly. With distributed applications,
                 a local interference in a node can affect the whole
                 execution of an application spanning many nodes. This
                 paper studies an interference modeling methodology for
                 distributed applications to predict their performance
                 under interference effects in consolidated clusters.
                 This study first characterizes the effects of
                 interference for various distributed applications over
                 different interference settings, and analyzes how
                 diverse interference intensities on multiple nodes
                 affect the overall performance. Based on the
                 characterization, this study proposes a static
                 profiling-based model for interference propagation and
                 heterogeneity behaviors. In addition, this paper
                 presents use case studies of the modeling method, two
                 interference-aware placement techniques for
                 consolidated virtual clusters, which attempt to
                 maximize the overall throughput or to guarantee the
                 quality-of-service.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@Article{Maas:2016:THL,
  author =       "Martin Maas and Krste Asanovi{\'c} and Tim Harris and
                 John Kubiatowicz",
  title =        "{Taurus}: a Holistic Language Runtime System for
                 Coordinating Distributed Managed-Language
                 Applications",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "457--471",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872386",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Many distributed workloads in today's data centers are
                 written in managed languages such as Java or Ruby.
                 Examples include big data frameworks such as Hadoop,
                 data stores such as Cassandra or applications such as
                 the SOLR search engine. These workloads typically run
                 across many independent language runtime systems on
                 different nodes. This setup represents a source of
                 inefficiency, as these language runtime systems are
                 unaware of each other. For example, they may perform
                 Garbage Collection at times that are locally reasonable
                 but not in a distributed setting. We address these
                 problems by introducing the concept of a Holistic
                 Runtime System that makes runtime-level decisions for
                 the entire distributed application rather than locally.
                 We then present Taurus, a Holistic Runtime System
                 prototype. Taurus is a JVM drop-in replacement,
                 requires almost no configuration and can run unmodified
                 off-the-shelf Java applications. Taurus enforces
                 user-defined coordination policies and provides a DSL
                 for writing these policies. By applying Taurus to
                 Garbage Collection, we demonstrate the potential of
                 such a system and use it to explore coordination
                 strategies for the runtime systems of real-world
                 distributed applications, to improve application
                 performance and address tail-latencies in
                 latency-sensitive workloads.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@Article{Delimitrou:2016:HRE,
  author =       "Christina Delimitrou and Christos Kozyrakis",
  title =        "{HCloud}: Resource-Efficient Provisioning in Shared
                 Cloud Systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "473--488",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872365",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Cloud computing promises flexibility and high
                 performance for users and cost efficiency for
                 operators. To achieve this, cloud providers offer
                 instances of different sizes, both as long-term
                 reservations and short-term, on-demand allocations.
                 Unfortunately, determining the best provisioning
                 strategy is a complex, multi-dimensional problem that
                 depends on the load fluctuation and duration of
                 incoming jobs, and the performance unpredictability and
                 cost of resources. We first compare the two main
                 provisioning strategies (reserved and on-demand
                 resources) on Google Compute Engine (GCE) using three
                 representative workload scenarios with batch and
                 latency-critical applications. We show that either
                 approach is suboptimal for performance or cost. We then
                 present HCloud, a hybrid provisioning system that uses
                 both reserved and on-demand resources. HCloud
                 determines which jobs should be mapped to reserved
                 versus on-demand resources based on overall load, and
                 resource unpredictability. It also determines the
                 optimal instance size an application needs to satisfy
                 its Quality of Service (QoS) constraints. We
                 demonstrate that hybrid configurations improve
                 performance by 2.1x compared to fully on-demand
                 provisioning, and reduce cost by 46\% compared to fully
                 reserved systems. We also show that hybrid strategies
                 are robust to variation in system and job parameters,
                 such as cost and system load.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@Article{Yu:2016:CWM,
  author =       "Xiao Yu and Pallavi Joshi and Jianwu Xu and Guoliang
                 Jin and Hui Zhang and Guofei Jiang",
  title =        "{CloudSeer}: Workflow Monitoring of Cloud
                 Infrastructures via Interleaved Logs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "489--502",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872407",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Cloud infrastructures provide a rich set of management
                 tasks that operate computing, storage, and networking
                 resources in the cloud. Monitoring the executions of
                 these tasks is crucial for cloud providers to promptly
                 find and understand problems that compromise cloud
                 availability. However, such monitoring is challenging
                 because there are multiple distributed service
                 components involved in the executions. CloudSeer
                 enables effective workflow monitoring. It takes a
                 lightweight non-intrusive approach that purely works on
                 interleaved logs widely existing in cloud
                 infrastructures. CloudSeer first builds an automaton
                 for the workflow of each management task based on
                 normal executions, and then it checks log messages
                 against a set of automata for workflow divergences in a
                 streaming manner. Divergences found during the checking
                 process indicate potential execution problems, which
                 may or may not be accompanied by error log messages.
                 For each potential problem, CloudSeer outputs necessary
                 context information including the affected task
                 automaton and related log messages hinting where the
                 problem occurs to help further diagnosis. Our
                 experiments on OpenStack, a popular open-source cloud
                 infrastructure, show that CloudSeer's efficiency and
                 problem-detection capability are suitable for online
                 monitoring.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@Article{Kwon:2016:LCI,
  author =       "Yonghwi Kwon and Dohyeong Kim and William Nick Sumner
                 and Kyungtae Kim and Brendan Saltaformaggio and Xiangyu
                 Zhang and Dongyan Xu",
  title =        "{LDX}: Causality Inference by Lightweight Dual
                 Execution",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "503--515",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872395",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Causality inference, such as dynamic taint analysis,
                 has many applications (e.g., information leak
                 detection). It determines whether an event e is
                 causally dependent on a preceding event c during
                 execution. We develop a new causality inference engine
                 LDX. Given an execution, it spawns a slave execution,
                 in which it mutates c and observes whether any change
                 is induced at e. To preclude non-determinism, LDX
                 couples the executions by sharing syscall outcomes. To
                 handle path differences induced by the perturbation, we
                 develop a novel on-the-fly execution alignment scheme
                 that maintains a counter to reflect the progress of
                 execution. The scheme relies on program analysis and
                 compiler transformation. LDX can effectively detect
                 information leak and security attacks with an average
                 overhead of 6.08\% while running the master and the
                 slave concurrently on separate CPUs, much lower than
                 existing systems that require instruction level
                 monitoring. Furthermore, it has much better accuracy in
                 causality inference.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@Article{Leesatapornwongsa:2016:TTN,
  author =       "Tanakorn Leesatapornwongsa and Jeffrey F. Lukman and
                 Shan Lu and Haryadi S. Gunawi",
  title =        "{TaxDC}: a Taxonomy of Non-Deterministic Concurrency
                 Bugs in Datacenter Distributed Systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "517--530",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872374",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "We present TaxDC, the largest and most comprehensive
                 taxonomy of non-deterministic concurrency bugs in
                 distributed systems. We study 104 distributed
                 concurrency (DC) bugs from four widely-deployed
                 cloud-scale datacenter distributed systems, Cassandra,
                 Hadoop MapReduce, HBase and ZooKeeper. We study DC-bug
                 characteristics along several axes of analysis such as
                 the triggering timing condition and input
                 preconditions, error and failure symptoms, and fix
                 strategies, collectively stored as 2,083 classification
                 labels in TaxDC database. We discuss how our study can
                 open up many new research directions in combating DC
                 bugs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@Article{Mao:2016:RFR,
  author =       "Junjie Mao and Yu Chen and Qixue Xiao and Yuanchun
                 Shi",
  title =        "{RID}: Finding Reference Count Bugs with Inconsistent
                 Path Pair Checking",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "531--544",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872389",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Reference counts are widely used in OS kernels for
                 resource management. However, reference counts are not
                 trivial to be used correctly in large scale programs
                 because it is left to developers to make sure that an
                 increment to a reference count is always paired with a
                 decrement. This paper proposes inconsistent path pair
                 checking, a novel technique that can statically
                 discover bugs related to reference counts without
                 knowing how reference counts should be changed in a
                 function. A prototype called RID is implemented and
                 evaluations show that RID can discover more than 80
                 bugs which were confirmed by the developers in the
                 latest Linux kernel. The results also show that RID
                 tends to reveal bugs caused by developers'
                 misunderstanding on API specifications or error
                 conditions that are not handled properly.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@Article{Zhang:2016:MPU,
  author =       "Huazhe Zhang and Henry Hoffmann",
  title =        "Maximizing Performance Under a Power Cap: a Comparison
                 of Hardware, Software, and Hybrid Techniques",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "545--559",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872375",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Power and thermal dissipation constrain multicore
                 performance scaling. Modern processors are built such
                 that they could sustain damaging levels of power
                 dissipation, creating a need for systems that can
                 implement processor power caps. A particular challenge
                 is developing systems that can maximize performance
                 within a power cap, and approaches have been proposed
                 in both software and hardware. Software approaches are
                 flexible, allowing multiple hardware resources to be
                 coordinated for maximum performance, but software is
                 slow, requiring a long time to converge to the power
                 target. In contrast, hardware power capping quickly
                 converges to the power cap, but only manages
                 voltage and frequency, limiting its potential
                 performance. In this work we propose PUPiL, a hybrid
                 software/hardware power capping system. Unlike previous
                 approaches, PUPiL combines hardware's fast reaction
                 time with software's flexibility. We implement PUPiL on
                 real Linux/x86 platform and compare it to Intel's
                 commercial hardware power capping system for both
                 single and multi-application workloads. We find PUPiL
                 provides the same reaction time as Intel's hardware
                 with significantly higher performance. On average,
                 PUPiL outperforms hardware by from 1.18-2.4x depending
                 on workload and power target. Thus, PUPiL provides a
                 promising way to enforce power caps with greater
                 performance than current state-of-the-art hardware-only
                 approaches.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@Article{Fan:2016:CSG,
  author =       "Songchun Fan and Seyed Majid Zahedi and Benjamin C.
                 Lee",
  title =        "The Computational Sprinting Game",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "561--575",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872383",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Computational sprinting is a class of mechanisms that
                 boost performance but dissipate additional power. We
                 describe a sprinting architecture in which many,
                 independent chip multiprocessors share a power supply
                 and sprints are constrained by the chips' thermal
                 limits and the rack's power limits. Moreover, we
                 present the computational sprinting game, a multi-agent
                 perspective on managing sprints. Strategic agents
                 decide whether to sprint based on application phases
                 and system conditions. The game produces an equilibrium
                 that improves task throughput for data analytics
                 workloads by 4-6$ \times $ over prior greedy heuristics
                 and performs within 90\% of an upper bound on
                 throughput from a globally optimized policy.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@Article{Colin:2016:EIF,
  author =       "Alexei Colin and Graham Harvey and Brandon Lucia and
                 Alanson P. Sample",
  title =        "An Energy-interference-free Hardware-Software Debugger
                 for Intermittent Energy-harvesting Systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "577--589",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872409",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Energy-autonomous computing devices have the potential
                 to extend the reach of computing to a scale beyond
                 either wired or battery-powered systems. However, these
                 devices pose a unique set of challenges to application
                 developers who lack both hardware and software support
                 tools. Energy harvesting devices experience power
                 intermittence which causes the system to reset and
                 power-cycle unpredictably, tens to hundreds of times
                 per second. This can result in code execution errors
                 that are not possible in continuously-powered systems
                 and cannot be diagnosed with conventional debugging
                 tools such as JTAG and/or oscilloscopes. We propose the
                 Energy-interference-free Debugger, a hardware and
                 software platform for monitoring and debugging
                 intermittent systems without adversely affecting their
                 energy state. The Energy-interference-free Debugger
                 re-creates a familiar debugging environment for
                 intermittent software and augments it with debugging
                 primitives for effective diagnosis of intermittence
                 bugs. Our evaluation of the Energy-interference-free
                 Debugger quantifies its energy-interference-freedom and
                 shows its value in a set of debugging tasks in complex
                 test programs and several real applications, including
                 RFID code and a machine-learning-based activity
                 recognition system.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@Article{Witchel:2016:PPW,
  author =       "Emmett Witchel",
  title =        "Programmer Productivity in a World of Mushy
                 Interfaces: Challenges of the Post-{ISA} Reality",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "591--591",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2876511",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Since 1964, we had the notion that the instruction set
                 architecture (ISA) is a useful and fairly opaque
                 abstraction layer between hardware and software.
                 Software rode hardware's performance wave while
                 remaining gloriously oblivious to hardware's growing
                 complexity. Unfortunately, the jig is up. We still have
                 ISAs, but the abstraction no longer offers seamless
                 portability---parallel software needs to be tuned for
                 different core counts, and heterogeneous processing
                 elements (CPUs, GPUs, accelerators) further complicate
                 programmability. We are better at building large-scale
                 heterogeneous processors than we are at programming
                 them. Maintaining software across multiple current
                 platforms is difficult and porting to future platforms
                 is also difficult. There have been many technical
                 responses: virtual ISAs (e.g., NVIDIA's PTX),
                 higher-level programming interfaces (e.g., CUDA or
                 OpenCL), and late-stage compilation and
                 platform-specific tailoring (e.g., Android ART), etc. A
                 team of opinionated experts, drawn from the three
                 ASPLOS communities will examine the problem of
                 programmer productivity in the post-ISA world, first
                 from the perspective of their area of expertise and
                 then noting the contributions from the other two
                 communities. What research will save us and how? This
                 wide-ranging debate will frame important research areas
                 for future work while being grounded in frank
                 discussion about what has succeeded in the past.
                 Attendees can expect actionable insight into important
                 research issues as well as an entertaining discussion.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@Article{Angstadt:2016:RPP,
  author =       "Kevin Angstadt and Westley Weimer and Kevin Skadron",
  title =        "{RAPID} Programming of Pattern-Recognition
                 Processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "593--605",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872393",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "We present RAPID, a high-level programming language
                 and combined imperative and declarative model for
                 programming pattern-recognition processors, such as
                 Micron's Automata Processor (AP). The AP is a novel,
                 non-Von Neumann architecture for direct execution of
                 non-deterministic finite automata (NFAs), and has been
                 demonstrated to provide substantial speedup for a
                 variety of data-processing applications. RAPID is
                 clear, maintainable, concise, and efficient both at
                 compile and run time. Language features, such as code
                 abstraction and parallel control structures, map well
                 to pattern-matching problems, providing clarity and
                 maintainability. For generation of efficient runtime
                 code, we present algorithms to convert RAPID programs
                 into finite automata. Further, we introduce a
                 tessellation technique for configuring the AP, which
                 significantly reduces compile time, increases
                 programmer productivity, and improves maintainability.
                 We evaluate five RAPID programs against custom,
                 baseline implementations previously demonstrated to be
                 significantly accelerated by the AP. We find that RAPID
                 programs are much shorter in length, are expressible at
                 a higher level of abstraction than their handcrafted
                 counterparts, and yield generated code that is often
                 more compact. In addition, our tessellation technique
                 for configuring the AP has comparable device
                 utilization to, and results in compilation that is up
                 to four orders of magnitude faster than, current
                 solutions.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@Article{Sui:2016:PCA,
  author =       "Xin Sui and Andrew Lenharth and Donald S. Fussell and
                 Keshav Pingali",
  title =        "Proactive Control of Approximate Programs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "607--621",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872402",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Approximate computing trades off accuracy of results
                 for resources such as energy or computing time. There
                 is a large and rapidly growing literature on
                 approximate computing that has focused mostly on
                 showing the benefits of approximate computing. However,
                 we know relatively little about how to control
                 approximation in a disciplined way. In this paper, we
                 address the problem of controlling approximation for
                 non-streaming programs that have a set of ``knobs''
                 that can be dialed up or down to control the level of
                 approximation of different components in the program.
                 We formulate this control problem as a constrained
                 optimization problem, and describe a system called
                 Capri that uses machine learning to learn cost and
                 error models for the program, and uses these models to
                 determine, for a desired level of approximation, knob
                 settings that optimize metrics such as running time or
                 energy usage. Experimental results with complex
                 benchmarks from different problem domains demonstrate
                 the effectiveness of this approach.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@Article{Park:2016:ATC,
  author =       "Jongse Park and Emmanuel Amaro and Divya Mahajan and
                 Bradley Thwaites and Hadi Esmaeilzadeh",
  title =        "{AxGames}: Towards Crowdsourcing Quality Target
                 Determination in Approximate Computing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "623--636",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872376",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Approximate computing trades quality of application
                 output for higher efficiency and performance.
                 Approximation is useful only if its impact on
                 application output quality is acceptable to the users.
                 However, there is a lack of systematic solutions and
                 studies that explore users' perspective on the effects
                 of approximation. In this paper, we seek to provide one
                 such solution for the developers to probe and discover
                 the boundary of quality loss that most users will deem
                 acceptable. We propose AxGames, a crowdsourced solution
                 that enables developers to readily infer a statistical
                 common ground from the general public through three
                 entertaining games. The users engage in these games by
                 betting on their opinion about the quality loss of the
                 final output while the AxGames framework collects
                 statistics about their perceptions. The framework then
                 statistically analyzes the results to determine the
                 acceptable levels of quality for a pair of
                 (application, approximation technique). The three games
                 are designed such that they effectively capture quality
                 requirements with various tradeoffs and contexts. To
                 evaluate AxGames, we examine seven diverse applications
                 that produce user perceptible outputs and cover a wide
                 range of domains, including image processing, optical
                 character recognition, speech to text conversion, and
                 audio processing. We recruit 700 participants/users
                 through Amazon's Mechanical Turk to play the games that
                 collect statistics about their perception on different
                 levels of quality. Subsequently, the AxGames framework
                 uses the Clopper-Pearson exact method, which computes a
                 binomial proportion confidence interval, to analyze the
                 collected statistics for each level of quality. Using
                 this analysis, AxGames can statistically project the
                 quality level that satisfies a given percentage of
                 users. The developers can use these statistical
                 projections to tune the level of approximation based on
                 the user experience. We find that the level of
                 acceptable quality loss significantly varies across
                 applications. For instance, to satisfy 90\% of users,
                 the level of acceptable quality loss is 2\% for one
                 application (image processing) and 26\% for another
                 (audio processing). Moreover, the pattern with which
                 the crowd responds to approximation takes significantly
                 different shape and form depending on the class of
                 applications. These results confirm the necessity of
                 solutions that systematically explore the effect of
                 approximation on the end user experience.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@Article{Bornholt:2016:DBA,
  author =       "James Bornholt and Randolph Lopez and Douglas M.
                 Carmean and Luis Ceze and Georg Seelig and Karin
                 Strauss",
  title =        "A {DNA}-Based Archival Storage System",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "637--649",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872397",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Demand for data storage is growing exponentially, but
                 the capacity of existing storage media is not keeping
                 up. Using DNA to archive data is an attractive
                 possibility because it is extremely dense, with a raw
                 limit of 1 exabyte/mm$^3$ ($10^9$ GB/mm$^3$), and
                 long-lasting, with observed half-life of over 500
                 years. This paper presents an architecture for a
                 DNA-based archival storage system. It is structured as
                 a key-value store, and leverages common biochemical
                 techniques to provide random access. We also propose a
                 new encoding scheme that offers controllable
                 redundancy, trading off reliability for density. We
                 demonstrate feasibility, random access, and robustness
                 of the proposed encoding with wet lab experiments
                 involving 151 kB of synthesized DNA and a 42 kB
                 random-access subset, and simulation experiments of
                 larger sets calibrated to the wet lab experiments.
                 Finally, we highlight trends in biotechnology that
                 indicate the impending practicality of DNA storage for
                 much larger datasets.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@Article{Prabhakar:2016:GCH,
  author =       "Raghu Prabhakar and David Koeplinger and Kevin J.
                 Brown and HyoukJoong Lee and Christopher {De Sa} and
                 Christos Kozyrakis and Kunle Olukotun",
  title =        "Generating Configurable Hardware from Parallel
                 Patterns",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "651--665",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872415",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In recent years the computing landscape has seen an
                 increasing shift towards specialized accelerators.
                 Field programmable gate arrays (FPGAs) are particularly
                 promising for the implementation of these accelerators,
                 as they offer significant performance and energy
                 improvements over CPUs for a wide class of applications
                 and are far more flexible than fixed-function ASICs.
                 However, FPGAs are difficult to program. Traditional
                 programming models for reconfigurable logic use
                 low-level hardware description languages like Verilog
                 and VHDL, which have none of the productivity features
                 of modern software languages but produce very efficient
                 designs, and low-level software languages like C and
                 OpenCL coupled with high-level synthesis (HLS) tools
                 that typically produce designs that are far less
                 efficient. Functional languages with parallel patterns
                 are a better fit for hardware generation because they
                 provide high-level abstractions to programmers with
                 little experience in hardware design and avoid many of
                 the problems faced when generating hardware from
                 imperative languages. In this paper, we identify two
                 important optimizations for using parallel patterns to
                 generate efficient hardware: tiling and metapipelining.
                 We present a general representation of tiled parallel
                 patterns, and provide rules for automatically tiling
                 patterns and generating metapipelines. We demonstrate
                 experimentally that these optimizations result in
                 speedups up to 39.4$ \times $ on a set of benchmarks
                 from the data analytics domain.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@Article{Chang:2016:DLD,
  author =       "Li-Wen Chang and Hee-Seok Kim and Wen-mei W. Hwu",
  title =        "{DySel}: Lightweight Dynamic Selection for
                 Kernel-based Data-parallel Programming Model",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "667--680",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872373",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The rising pressure for simultaneously improving
                 performance and reducing power is driving more
                 diversity into all aspects of computing devices. An
                 algorithm that is well-matched to the target hardware
                 can run multiple times faster and more energy
                 efficiently than one that is not. The problem is
                 complicated by the fact that a program's input also
                 affects the appropriate choice of algorithm. As a
                 result, software developers have been faced with the
                 challenge of determining the appropriate algorithm for
                 each potential combination of target device and data.
                 This paper presents DySel, a novel runtime system for
                 automating such determination for kernel-based data
                 parallel programming models such as OpenCL, CUDA,
                 OpenACC, and C++AMP. These programming models cover
                 many applications that demand high performance in
                 mobile, cloud and high-performance computing. DySel
                 systematically deploys candidate kernels on a small
                 portion of the actual data to determine which achieves
                 the best performance for the hardware-data combination.
                 The test-deployment, referred to as micro-profiling,
                 contributes to the final execution result and incurs
                 less than 8\% of overhead in the worst observed case
                 when compared to an oracle. We show four major use
                 cases where DySel provides significantly more
                 consistent performance without tedious effort from the
                 developer.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@Article{Chen:2016:BQA,
  author =       "Quan Chen and Hailong Yang and Jason Mars and Lingjia
                 Tang",
  title =        "{Baymax}: {QoS} Awareness and Increased Utilization
                 for Non-Preemptive Accelerators in Warehouse Scale
                 Computers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "681--696",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872368",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Modern warehouse-scale computers (WSCs) are being
                 outfitted with accelerators to provide the significant
                 compute required by emerging intelligent personal
                 assistant (IPA) workloads such as voice recognition,
                 image classification, and natural language processing.
                 It is well known that the diurnal user access pattern
                 of user-facing services provides a strong incentive to
                 co-locate applications for better accelerator
                 utilization and efficiency, and prior work has focused
                 on enabling co-location on multicore processors.
                 However, interference when co-locating applications on
                 non-preemptive accelerators is fundamentally different
                 than contention on multi-core CPUs and introduces a new
                 set of challenges to reduce QoS violation. To address
                 this open problem, we first identify the underlying
                 causes for QoS violation in accelerator-outfitted
                 servers. Our experiments show that queuing delay for
                 the compute resources and PCI-e bandwidth contention
                 for data transfer are the main two factors that
                 contribute to the long tails of user-facing
                 applications. We then present Baymax, a runtime system
                 that orchestrates the execution of compute tasks from
                 different applications and mitigates PCI-e bandwidth
                 contention to deliver the required QoS for user-facing
                 applications and increase the accelerator utilization.
                 Using DjiNN, a deep neural network service, Sirius, an
                 end-to-end IPA workload, and traditional applications
                 on a Nvidia K40 GPU, our evaluation shows that Baymax
                 improves the accelerator utilization by 91.3\% while
                 achieving the desired 99\%-ile latency target for
                 user-facing applications. In fact, Baymax reduces the
                 99\%-ile latency of user-facing applications by up to
                 195x over default execution.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@Article{Nowatzki:2016:ABS,
  author =       "Tony Nowatzki and Karthikeyan Sankaralingam",
  title =        "Analyzing Behavior Specialized Acceleration",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "697--711",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872412",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Hardware specialization has become a promising
                 paradigm for overcoming the inefficiencies of general
                 purpose microprocessors. Of significant interest are
                 Behavioral Specialized Accelerators (BSAs), which are
                 designed to efficiently execute code with only certain
                 properties, but remain largely configurable or
                 programmable. The most important strength of BSAs ---
                 their ability to target a wide variety of codes ---
                 also makes their interactions and analysis complex,
                 raising the following questions: can multiple BSAs be
                 composed synergistically, what are their interactions
                 with the general purpose core, and what combinations
                 favor which workloads? From a methodological
                 standpoint, BSAs are also challenging, as they each
                 require ISA development, compiler and assembler
                 extensions, and either simulator or RTL models. To
                 study the potential of BSAs, we propose a novel
                 modeling technique called the Transformable Dependence
                 Graph (TDG) --- a higher level alternative to the
                 time-consuming traditional compiler+simulator approach,
                 while still enabling detailed microarchitectural models
                 for both general cores and accelerators. We then
                 propose a multi-BSA organization, called ExoCore, which
                 we model and study using the TDG. A design space
                 exploration reveals that an ExoCore organization can
                 push designs beyond the established energy-performance
                 frontiers for general purpose cores. For example, a
                 2-wide OOO processor with three BSAs matches the
                 performance of a conventional 6-wide OOO core, has 40\%
                 lower area, and is 2.6x more energy efficient.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@Article{Yoon:2016:PPI,
  author =       "Man-Ki Yoon and Negin Salajegheh and Yin Chen and
                 Mihai Christodorescu",
  title =        "{PIFT}: Predictive Information-Flow Tracking",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "713--725",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872403",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Phones today carry sensitive information and have a
                 great number of ways to communicate that data. As a
                 result, malware that steal money, information, or
                 simply disable functionality have hit the app stores.
                 Current security solutions for preventing undesirable
                 data leaks are mostly high-overhead and have not been
                 practical enough for smartphones. In this paper, we
                 show that by simply monitoring just some instructions
                 (only memory loads and stores) it is possible to
                 achieve low overhead, highly accurate information flow
                 tracking. Our method achieves 98\% accuracy (0\% false
                 positive and 2\% false negative) over DroidBench and
                 was able to successfully catch seven real-world malware
                 instances that steal phone number, location, and device
                 ID using SMS messages and HTTP connections.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@Article{Venkat:2016:HHI,
  author =       "Ashish Venkat and Sriskanda Shamasunder and Hovav
                 Shacham and Dean M. Tullsen",
  title =        "{HIPStR}: Heterogeneous-{ISA} Program State
                 Relocation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "727--741",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872408",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Heterogeneous Chip Multiprocessors have been shown to
                 provide significant performance and energy efficiency
                 gains over homogeneous designs. Recent research has
                 expanded the dimensions of heterogeneity to include
                 diverse Instruction Set Architectures, called
                 Heterogeneous-ISA Chip Multiprocessors. This work
                 leverages such an architecture to realize substantial
                 new security benefits, and in particular, to thwart
                 Return-Oriented Programming. This paper proposes a
                 novel security defense called HIPStR ---
                 Heterogeneous-ISA Program State Relocation --- that
                 performs dynamic randomization of run-time program
                 state, both within and across ISAs. This technique
                 outperforms the state-of-the-art just-in-time code
                 reuse (JIT-ROP) defense by an average of 15.6\%, while
                 simultaneously providing greater security guarantees
                 against classic return-into-libc, ROP, JOP, brute
                 force, JIT-ROP, and several evasive variants.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@Article{Aweke:2016:ASB,
  author =       "Zelalem Birhanu Aweke and Salessawi Ferede Yitbarek
                 and Rui Qiao and Reetuparna Das and Matthew Hicks and
                 Yossi Oren and Todd Austin",
  title =        "{ANVIL}: Software-Based Protection Against
                 Next-Generation Rowhammer Attacks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "743--755",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872390",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Ensuring the integrity and security of the memory
                 system is critical. Recent studies have shown serious
                 security concerns due to ``rowhammer'' attacks, where
                 repeated accesses to a row of memory cause bit flips in
                 adjacent rows. Recent work by Google's Project Zero has
                 shown how to leverage rowhammer-induced bit-flips as
                 the basis for security exploits that include malicious
                 code injection and memory privilege escalation. Being
                 an important security concern, industry has attempted
                 to defend against rowhammer attacks. Deployed defenses
                 employ two strategies: (1) doubling the system DRAM
                 refresh rate and (2) restricting access to the CLFLUSH
                 instruction that attackers use to bypass the cache to
                 increase memory access frequency (i.e., the rate of
                 rowhammering). We demonstrate that such defenses are
                 inadequate: we implement rowhammer attacks that both
                 avoid using the CLFLUSH instruction and cause bit flips
                 with a doubled refresh rate. Our next-generation
                 CLFLUSH-free rowhammer attack bypasses the cache by
                 manipulating cache replacement state to allow frequent
                 misses out of the last-level cache to DRAM rows of our
                 choosing. To protect existing systems from more
                 advanced rowhammer attacks, we develop a software-based
                 defense, ANVIL, which thwarts all known rowhammer
                 attacks on existing systems. ANVIL detects rowhammer
                 attacks by tracking the locality of DRAM accesses using
                 existing hardware performance counters. Our detector
                 identifies the rows being frequently accessed (i.e.,
                 the aggressors), then selectively refreshes the nearby
                 victim rows to prevent hammering. Experiments running
                 on real hardware with the SPEC2006 benchmarks show that
                 ANVIL has less than a 1\% false positive rate and an
                 average slowdown of 1\%. ANVIL is low-cost and robust,
                 and our experiments indicate that it is an effective
                 approach for protecting existing and future systems
                 from even advanced rowhammer attacks.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@Article{Didona:2016:PAM,
  author =       "Diego Didona and Nuno Diegues and Anne-Marie Kermarrec
                 and Rachid Guerraoui and Ricardo Neves and Paolo
                 Romano",
  title =        "{ProteusTM}: Abstraction Meets Performance in
                 Transactional Memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "757--771",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872385",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The Transactional Memory (TM) paradigm promises to
                 greatly simplify the development of concurrent
                 applications. This led, over the years, to the creation
                 of a plethora of TM implementations delivering wide
                 ranges of performance across workloads. Yet, no
                 universal implementation fits each and every workload.
                 In fact, the best TM in a given workload can reveal to
                 be disastrous for another one. This forces developers
                 to face the complex task of tuning TM implementations,
                 which significantly hampers their wide adoption. In
                 this paper, we address the challenge of automatically
                 identifying the best TM implementation for a given
                 workload. Our proposed system, ProteusTM, hides behind
                 the TM interface a large library of implementations.
                 Underneath, it leverages a novel multi-dimensional
                 online optimization scheme, combining two popular
                 learning techniques: Collaborative Filtering and
                 Bayesian Optimization. We integrated ProteusTM in GCC
                 and demonstrate its ability to switch between TMs and
                 adapt several configuration parameters (e.g., number of
                 threads). We extensively evaluated ProteusTM, obtaining
                 average performance {$<$3}\% from optimal, and gains up
                 to 100x over static alternatives.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@Article{Shalev:2016:CCS,
  author =       "Noam Shalev and Eran Harpaz and Hagar Porat and Idit
                 Keidar and Yaron Weinsberg",
  title =        "{CSR}: Core Surprise Removal in Commodity Operating
                 Systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "773--787",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872369",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "One of the adverse effects of shrinking transistor
                 sizes is that processors have become increasingly prone
                 to hardware faults. At the same time, the number of
                 cores per die rises. Consequently, core failures can no
                 longer be ruled out, and future operating systems for
                 many-core machines will have to incorporate fault
                 tolerance mechanisms. We present CSR, a strategy for
                 recovery from unexpected permanent processor faults in
                 commodity operating systems. Our approach overcomes
                 surprise removal of faulty cores, and also tolerates
                 cascading core failures. When a core fails in user
                 mode, CSR terminates the process executing on that core
                 and migrates the remaining processes in its run-queue
                 to other cores. We further show how hardware
                 transactional memory may be used to overcome failures
                 in critical kernel code. Our solution is scalable,
                 incurs low overhead, and is designed to integrate into
                 modern operating systems. We have implemented it in the
                 Linux kernel, using Haswell's Transactional
                 Synchronization Extension, and tested it on a real
                 system.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@Article{Gangwani:2016:CBS,
  author =       "Tanmay Gangwani and Adam Morrison and Josep
                 Torrellas",
  title =        "{CASPAR}: Breaking Serialization in Lock-Free
                 Multicore Synchronization",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "2",
  pages =        "789--804",
  month =        may,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/2980024.2872400",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:42 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In multicores, performance-critical synchronization is
                 increasingly performed in a lock-free manner using
                 atomic instructions such as CAS or LL/SC. However, when
                 many processors synchronize on the same variable,
                 performance can still degrade significantly. Contending
                 writes get serialized, creating a non-scalable
                 condition. Past proposals that build hardware queues of
                 synchronizing processors do not fundamentally solve
                 this problem---at best, they help to efficiently
                 serialize the contending writes. This paper proposes a
                 novel architecture that breaks the serialization of
                 hardware queues and enables the queued processors to
                 perform lock-free synchronization in parallel. The
                 architecture, called CASPAR, is able to (1) execute the
                 CASes in the queued-up processors in parallel through
                 eager forwarding of expected values, and (2) validate
                 the CASes in parallel and dequeue groups of processors
                 at a time. The result is highly-scalable
                 synchronization. We evaluate CASPAR with simulations of
                 a 64-core chip. Compared to existing proposals with
                 hardware queues, CASPAR improves the throughput of
                 kernels by 32\% on average, and reduces the execution
                 time of the sections considered in lock-free versions
                 of applications by 47\% on average. This makes these
                 sections 2.5x faster than in the original
                 applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'16 conference proceedings.",
}

@Article{Albericio:2016:CIN,
  author =       "Jorge Albericio and Patrick Judd and Tayler
                 Hetherington and Tor Aamodt and Natalie Enright Jerger
                 and Andreas Moshovos",
  title =        "{Cnvlutin}: ineffectual-neuron-free deep neural
                 network computing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "1--13",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001138",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This work observes that a large fraction of the
                 computations performed by Deep Neural Networks (DNNs)
                 are intrinsically ineffectual as they involve a
                 multiplication where one of the inputs is zero. This
                 observation motivates Cnvlutin (CNV), a value-based
                 approach to hardware acceleration that eliminates most
                 of these ineffectual operations, improving performance
                 and energy over a state-of-the-art accelerator with no
                 accuracy loss. CNV uses hierarchical data-parallel
                 units, allowing groups of lanes to proceed mostly
                 independently enabling them to skip over the
                 ineffectual computations. A co-designed data storage
                 format encodes the computation elimination decisions
                 taking them off the critical path while avoiding
                 control divergence in the data parallel units.
                 Combined, the units and the data storage format result
                 in a data-parallel architecture that maintains wide,
                 aligned accesses to its memory hierarchy and that keeps
                 its data lanes busy. By loosening the ineffectual
                 computation identification criterion, CNV enables
                 further performance and energy efficiency improvements,
                 and more so if a loss in accuracy is acceptable.
                 Experimental measurements over a set of
                 state-of-the-art DNNs for image classification show
                 that CNV improves performance over a state-of-the-art
                 accelerator from 1.24$ \times $ to 1.55$ \times $ and
                 by 1.37$ \times $ on average without any loss in
                 accuracy by removing zero-valued operand
                 multiplications alone. While CNV incurs an area
                 overhead of 4.49\%, it improves overall EDP (Energy
                 Delay Product) and ED$^2$P (Energy Delay Squared
                 Product) on average by 1.47$ \times $ and 2.01$ \times
                 $, respectively. The average performance improvements
                 increase to 1.52$ \times $ without any loss in accuracy
                 with a broader ineffectual identification policy.
                 Further improvements are demonstrated with a loss in
                 accuracy.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

@Article{Shafiee:2016:ICN,
  author =       "Ali Shafiee and Anirban Nag and Naveen Muralimanohar
                 and Rajeev Balasubramonian and John Paul Strachan and
                 Miao Hu and R. Stanley Williams and Vivek Srikumar",
  title =        "{ISAAC}: a convolutional neural network accelerator
                 with in-situ analog arithmetic in crossbars",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "14--26",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001139",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "A number of recent efforts have attempted to design
                 accelerators for popular machine learning algorithms,
                 such as those involving convolutional and deep neural
                 networks (CNNs and DNNs). These algorithms typically
                 involve a large number of multiply-accumulate
                 (dot-product) operations. A recent project, DaDianNao,
                 adopts a near data processing approach, where a
                 specialized neural functional unit performs all the
                 digital arithmetic operations and receives input
                 weights from adjacent eDRAM banks. This work explores
                 an in-situ processing approach, where memristor
                 crossbar arrays not only store input weights, but are
                 also used to perform dot-product operations in an
                 analog manner. While the use of crossbar memory as an
                 analog dot-product engine is well known, no prior work
                 has designed or characterized a full-fledged
                 accelerator based on crossbars. In particular, our work
                 makes the following contributions: (i) We design a
                 pipelined architecture, with some crossbars dedicated
                 for each neural network layer, and eDRAM buffers that
                 aggregate data between pipeline stages. (ii) We define
                 new data encoding techniques that are amenable to
                 analog computations and that can reduce the high
                 overheads of analog-to-digital conversion (ADC). (iii)
                 We define the many supporting digital components
                 required in an analog CNN accelerator and carry out a
                 design space exploration to identify the best balance
                 of memristor storage/compute, ADCs, and eDRAM storage
                 on a chip. On a suite of CNN and DNN workloads, the
                 proposed ISAAC architecture yields improvements of
                 14.8$ \times $, 5.5$ \times $, and 7.5$ \times $ in
                 throughput, energy, and computational density
                 (respectively), relative to the state-of-the-art
                 DaDianNao architecture.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

@Article{Chi:2016:PNP,
  author =       "Ping Chi and Shuangchen Li and Cong Xu and Tao Zhang
                 and Jishen Zhao and Yongpan Liu and Yu Wang and Yuan
                 Xie",
  title =        "{PRIME}: a novel processing-in-memory architecture for
                 neural network computation in {ReRAM}-based main
                 memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "27--39",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001140",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Processing-in-memory (PIM) is a promising solution to
                 address the ``memory wall'' challenges for future
                 computer systems. Prior proposed PIM architectures put
                 additional computation logic in or near memory. The
                 emerging metal-oxide resistive random access memory
                 (ReRAM) has showed its potential to be used for main
                 memory. Moreover, with its crossbar array structure,
                 ReRAM can perform matrix-vector multiplication
                 efficiently, and has been widely studied to accelerate
                 neural network (NN) applications. In this work, we
                 propose a novel PIM architecture, called PRIME, to
                 accelerate NN applications in ReRAM based main memory.
                 In PRIME, a portion of ReRAM crossbar arrays can be
                 configured as accelerators for NN applications or as
                 normal memory for a larger memory space. We provide
                 microarchitecture and circuit designs to enable the
                 morphable functions with an insignificant area
                 overhead. We also design a software/hardware interface
                 for software developers to implement various NNs on
                 PRIME. Benefiting from both the PIM architecture and
                 the efficiency of using ReRAM for NN computation, PRIME
                 distinguishes itself from prior work on NN
                 acceleration, with significant performance improvement
                 and energy saving. Our experimental results show that,
                 compared with a state-of-the-art neural processing unit
                 design, PRIME improves the performance by $ \sim 2360
                 \times $ and the energy consumption by $ \sim 895
                 \times $, across
                 the evaluated machine learning benchmarks.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

@Article{Torng:2016:AAW,
  author =       "Christopher Torng and Moyang Wang and Christopher
                 Batten",
  title =        "Asymmetry-aware work-stealing runtimes",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "40--52",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001142",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Amdahl's law provides architects a compelling reason
                 to introduce system asymmetry to optimize for both
                 serial and parallel regions of execution. Asymmetry in
                 a multicore processor can arise statically (e.g., from
                 core microarchitecture) or dynamically (e.g., applying
                 dynamic voltage/frequency scaling). Work stealing is an
                 increasingly popular approach to task distribution that
                 elegantly balances task-based parallelism across
                 multiple worker threads. In this paper, we propose
                 asymmetry-aware work-stealing (AAWS) runtimes, which
                 are carefully designed to exploit both the static and
                 dynamic asymmetry in modern systems. AAWS runtimes use
                 three key hardware/software techniques: work-pacing,
                 work-sprinting, and work-mugging. Work-pacing and
                 work-sprinting are novel techniques that combine a
                 marginal-utility-based approach with integrated voltage
                 regulators to improve performance and energy efficiency
                 in high- and low-parallel regions. Work-mugging is a
                 previously proposed technique that enables a waiting
                 big core to preemptively migrate work from a busy
                 little core. We propose a simple implementation of
                 work-mugging based on lightweight user-level
                 interrupts. We use a vertically integrated research
                 methodology spanning software, architecture, and VLSI
                 to make the case that holistically combining static
                 asymmetry, dynamic asymmetry, and work-stealing
                 runtimes can improve both performance and energy
                 efficiency in future multicore systems.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

@Article{Tseng:2016:MCA,
  author =       "Hung-Wei Tseng and Qianchen Zhao and Yuxiao Zhou and
                 Mark Gahagan and Steven Swanson",
  title =        "{Morpheus}: creating application objects efficiently
                 for heterogeneous computing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "53--65",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001143",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In high performance computing systems, object
                 deserialization can become a surprisingly important
                 bottleneck---in our test, a set of general-purpose,
                 highly parallelized applications spends 64\% of total
                 execution time deserializing data into objects. This
                 paper presents the Morpheus model, which allows
                 applications to move such computations to a storage
                 device. We use this model to deserialize data into
                 application objects inside storage devices, rather than
                 in the host CPU. Using the Morpheus model for object
                 deserialization avoids unnecessary system overheads,
                 frees up scarce CPU and main memory resources for
                 compute-intensive workloads, saves I/O bandwidth, and
                 reduces power consumption. In heterogeneous,
                 co-processor-equipped systems, Morpheus allows
                 application objects to be sent directly from a storage
                 device to a co-processor (e.g., a GPU) by peer-to-peer
                 transfer, further improving application performance as
                 well as reducing the CPU and main memory utilizations.
                 This paper implements Morpheus-SSD, an SSD supporting
                 the Morpheus model. Morpheus-SSD improves the
                 performance of object deserialization by 1.66$ \times
                 $, reduces power consumption by 7\%, uses 42\% less
                 energy, and speeds up the total execution time by 1.32$
                 \times $. By using NVMe-P2P that realizes peer-to-peer
                 communication between Morpheus-SSD and a GPU,
                 Morpheus-SSD can speed up the total execution time by
                 1.39$ \times $ in a heterogeneous computing platform.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

@Article{Mahajan:2016:TSG,
  author =       "Divya Mahajan and Amir Yazdanbakhsh and Jongse Park
                 and Bradley Thwaites and Hadi Esmaeilzadeh",
  title =        "Towards statistical guarantees in controlling quality
                 tradeoffs for approximate acceleration",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "66--77",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001144",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Conventionally, an approximate accelerator replaces
                 every invocation of a frequently executed region of
                 code without considering the final quality degradation.
                 However, there is a vast decision space in which each
                 invocation can either be delegated to the
                 accelerator---improving performance and efficiency---or
                 run on the precise core---maintaining quality. In this
                 paper we introduce Mithra, a co-designed
                 hardware-software solution, that navigates these
                 tradeoffs to deliver high performance and efficiency
                 while lowering the final quality loss. Mithra seeks to
                 identify whether each individual accelerator invocation
                 will lead to an undesirable quality loss and, if so,
                 directs the processor to run the original precise code.
                 This identification is cast as a binary classification
                 task that requires a cohesive co-design of hardware and
                 software. The hardware component performs the
                 classification at runtime and exposes a knob to the
                 software mechanism to control quality tradeoffs. The
                 software tunes this knob by solving a statistical
                 optimization problem that maximizes benefits from
                 approximation while providing statistical guarantees
                 that final quality level will be met with high
                 confidence. The software uses this knob to tune and
                 train the hardware classifiers. We devise two distinct
                 hardware classifiers, one table-based and one neural
                 network based. To understand the efficacy of these
                 mechanisms, we compare them with an ideal, but
                 infeasible design, the oracle. Results show that, with
                 95\% confidence the table-based design can restrict the
                 final output quality loss to 5\% for 90\% of unseen
                 input sets while providing 2.5$ \times $ speedup and
                 2.6$ \times $ energy efficiency. The neural design
                 shows similar speedup however, improves the efficiency
                 by 13\%. Compared to the table-based design, the oracle
                 improves speedup by 26\% and efficiency by 36\%. These
                 results show that Mithra performs within a close range
                 of the oracle and can effectively navigate the quality
                 tradeoffs in approximate acceleration.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

@Article{Jain:2016:BFL,
  author =       "Akanksha Jain and Calvin Lin",
  title =        "Back to the future: leveraging {Belady}'s algorithm
                 for improved cache replacement",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "78--89",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001146",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Belady's algorithm is optimal but infeasible because
                 it requires knowledge of the future. This paper
                 explains how a cache replacement algorithm can
                 nonetheless learn from Belady's algorithm by applying
                 it to past cache accesses to inform future cache
                 replacement decisions. We show that the implementation
                 is surprisingly efficient, as we introduce a new method
                 of efficiently simulating Belady's behavior, and we use
                 known sampling techniques to compactly represent the
                 long history information that is needed for high
                 accuracy. For a 2MB LLC, our solution uses a 16KB
                 hardware budget (excluding replacement state in the tag
                 array). When applied to a memory-intensive subset of
                 the SPEC 2006 CPU benchmarks, our solution improves
                 performance over LRU by 8.4\%, as opposed to 6.2\% for
                 the previous state-of-the-art. For a 4-core system with
                 a shared 8MB LLC, our solution improves performance by
                 15.0\%, compared to 12.0\% for the previous
                 state-of-the-art.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

@Article{Park:2016:ESFa,
  author =       "Chang Hyun Park and Taekyung Heo and Jaehyuk Huh",
  title =        "Efficient synonym filtering and scalable delayed
                 translation for hybrid virtual caching",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "90--102",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001147",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Conventional translation look-aside buffers (TLBs) are
                 required to complete address translation with short
                 latencies, as the address translation is on the
                 critical path of all memory accesses even for L1 cache
                 hits. Such strict TLB latency restrictions limit the
                 TLB capacity, as the latency increase with large TLBs
                 may lower the overall performance even with potential
                 TLB miss reductions. Furthermore, TLBs consume a
                 significant amount of energy as they are accessed for
                 every instruction fetch and data access. To avoid the
                 latency restriction and reduce the energy consumption,
                 virtual caching techniques have been proposed to defer
                 translation to after L1 cache misses. However, an
                 efficient solution for the synonym problem has been a
                 critical issue hindering the wide adoption of virtual
                 caching. Based on the virtual caching concept, this
                 study proposes a hybrid virtual memory architecture
                 extending virtual caching to the entire cache
                 hierarchy, aiming to improve both performance and
                 energy consumption. The hybrid virtual caching uses
                 virtual addresses augmented with address space
                 identifiers (ASID) in the cache hierarchy for common
                 non-synonym addresses. For such non-synonyms, the
                 address translation occurs only after last-level cache
                 (LLC) misses. For uncommon synonym addresses, the
                 addresses are translated to physical addresses with
                 conventional TLBs before L1 cache accesses. To support
                 such hybrid translation, we propose an efficient
                 synonym detection mechanism based on Bloom filters
                 which can identify synonym candidates with few false
                 positives. For large memory applications, delayed
                 translation alone cannot solve the address translation
                 problem, as fixed-granularity delayed TLBs may not
                 scale with the increasing memory requirements. To
                 mitigate the translation scalability problem, this
                 study proposes a delayed many segment translation
                 designed for the hybrid virtual caching. The
                 experimental results show that our approach effectively
                 lowers accesses to the TLBs, leading to significant
                 power savings. In addition, the approach provides
                 performance improvement with scalable delayed
                 translation with variable length segments.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

@Article{Cheng:2016:LLB,
  author =       "Hsiang-Yun Cheng and Jishen Zhao and Jack Sampson and
                 Mary Jane Irwin and Aamer Jaleel and Yu Lu and Yuan
                 Xie",
  title =        "{LAP}: loop-block aware inclusion properties for
                 energy-efficient asymmetric last level caches",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "103--114",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001148",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Emerging non-volatile memory (NVM) technologies, such
                 as spin-transfer torque RAM (STT-RAM), are attractive
                 options for replacing or augmenting SRAM in
                 implementing last-level caches (LLCs). However, the
                 asymmetric read/write energy and latency associated
                 with NVM introduces new challenges in designing caches
                 where, in contrast to SRAM, dynamic energy from write
                 operations can be responsible for a larger fraction of
                 total cache energy than leakage. These properties lead
                 to the fact that no single traditional inclusion policy
                 being dominant in terms of LLC energy consumption for
                 asymmetric LLCs. We propose a novel selective inclusion
                 policy, Loop-block-Aware Policy (LAP), to reduce
                 energy consumption in LLCs with asymmetric read/write
                 properties. In order to eliminate redundant writes to
                 the LLC, LAP incorporates advantages from both
                 non-inclusive and exclusive designs to selectively
                 cache only part of upper-level data in the LLC. Results
                 show that LAP outperforms other variants of selective
                 inclusion policies and consumes 20\% and 12\% less
                 energy than non-inclusive and exclusive STT-RAM-based
                 LLCs, respectively. We extend LAP to a system with
                 SRAM/STT-RAM hybrid LLCs to achieve energy-efficient
                 data placement, reducing the energy consumption by 22\%
                 and 15\% over non-inclusion and exclusion on average,
                 with average-case performance improvements, small
                 worst-case performance loss, and minimal hardware
                 overheads.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

%%% ISCA '16 paper in Computer Architecture News 44(3), pages 115--127:
%%% a parameterized-template hardware representation with hybrid area
%%% and runtime estimation, used for automatic design-space exploration
%%% of FPGA accelerators.
@Article{Koeplinger:2016:AGE,
  author =       "David Koeplinger and Christina Delimitrou and Raghu
                 Prabhakar and Christos Kozyrakis and Yaqi Zhang and
                 Kunle Olukotun",
  title =        "Automatic generation of efficient accelerators for
                 reconfigurable hardware",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "115--127",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001150",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Acceleration in the form of customized datapaths offer
                 large performance and energy improvements over general
                 purpose processors. Reconfigurable fabrics such as
                 FPGAs are gaining popularity for use in implementing
                 application-specific accelerators, thereby increasing
                 the importance of having good high-level FPGA design
                 tools. However, current tools for targeting FPGAs offer
                 inadequate support for high-level programming, resource
                 estimation, and rapid and automatic design space
                 exploration. We describe a design framework that
                 addresses these challenges. We introduce a new
                 representation of hardware using parameterized
                 templates that captures locality and parallelism
                 information at multiple levels of nesting. This
                 representation is designed to be automatically
                 generated from high-level languages based on parallel
                 patterns. We describe a hybrid area estimation
                 technique which uses template-level models and
                 design-level artificial neural networks to account for
                 effects from hardware place-and-route tools, including
                 routing overheads, register and block RAM duplication,
                 and LUT packing. Our runtime estimation accounts for
                 off-chip memory accesses. We use our estimation
                 capabilities to rapidly explore a large space of
                 designs across tile sizes, parallelization factors, and
                 optional coarse-grained pipelining, all at multiple
                 loop levels. We show that estimates average 4.8\% error
                 for logic resources, 6.1\% error for runtimes, and are
                 279 to 6533 times faster than a commercial high-level
                 synthesis tool. We compare the best-performing designs
                 to optimized CPU code running on a server-grade 6 core
                 processor and show speedups of up to 16.7$ \times $.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

%%% ISCA '16 paper in Computer Architecture News 44(3), pages 128--139:
%%% Strober, an FPGA-assisted sample-based energy simulation method that
%%% replays captured RTL state snapshots in gate-level simulation.
@Article{Kim:2016:SFA,
  author =       "Donggyu Kim and Adam Izraelevitz and Christopher Celio
                 and Hokeun Kim and Brian Zimmer and Yunsup Lee and
                 Jonathan Bachrach and Krste Asanovi{\'c}",
  title =        "{Strober}: fast and accurate sample-based energy
                 simulation for arbitrary {RTL}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "128--139",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001151",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper presents a sample-based energy simulation
                 methodology that enables fast and accurate estimations
                 of performance and average power for arbitrary RTL
                 designs. Our approach uses an FPGA to simultaneously
                 simulate the performance of an RTL design and to
                 collect samples containing exact RTL state snapshots.
                 Each snapshot is then replayed in gate-level
                 simulation, resulting in a workload-specific average
                 power estimate with confidence intervals. For arbitrary
                 RTL and workloads, our methodology guarantees a minimum
                 of four-orders-of-magnitude speedup over commercial CAD
                 gate-level simulation tools and gives average energy
                 estimates guaranteed to be within 5\% of the true
                 average energy with 99\% confidence. We believe our
                 open-source sample-based energy simulation tool Strober
                 can not only rapidly provide ground truth for more
                 abstract power models, but can enable productive
                 design-space exploration early in the RTL design
                 process.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

%%% ISCA '16 paper in Computer Architecture News 44(3), pages 140--152:
%%% PowerChop, unit-level power management at application-phase
%%% granularity for HW/SW co-designed hybrid processors.
@Article{Laurenzano:2016:PIM,
  author =       "Michael A. Laurenzano and Yunqi Zhang and Jiang Chen
                 and Lingjia Tang and Jason Mars",
  title =        "{PowerChop}: identifying and managing non-critical
                 units in hybrid processor architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "140--152",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001152",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "On-core microarchitectural structures consume
                 significant portions of a processor's power budget.
                 However, depending on application characteristics,
                 those structures do not always provide (much)
                 performance benefit. While timeout-based power gating
                 techniques have been leveraged for underutilized cores
                 and inactive functional units, these techniques have
                 not directly translated to high-activity units such as
                 vector processing units, complex branch predictors, and
                 caches. The performance benefit provided by these units
                 does not necessarily correspond with unit activity, but
                 instead is a function of application characteristics.
                 This work introduces PowerChop, a novel technique that
                 leverages the unique capabilities of HW/SW co-designed
                 hybrid processors to enact unit-level power management
                 at the application phase level. PowerChop adds two
                 small additional hardware units to facilitate phase
                 identification and triggering different power states,
                 enabling the software layer to cheaply track, predict
                 and take advantage of varying unit criticality across
                 application phases by powering gating units that are
                 not needed for performant execution. Through detailed
                 experimentation, we find that PowerChop significantly
                 decreases power consumption, reducing the leakage power
                 of a hybrid server processor by 9\% on average (up to
                 33\%) and a hybrid mobile processor by 19\% (up to
                 40\%) while introducing just 2\% slowdown.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

%%% ISCA '16 paper in Computer Architecture News 44(3), pages 153--165:
%%% Biscuit, a data-flow-based near-data processing framework for
%%% solid-state drives, evaluated on TPC-H queries.
@Article{Gu:2016:BFN,
  author =       "Boncheol Gu and Andre S. Yoon and Duck-Ho Bae and
                 Insoon Jo and Jinyoung Lee and Jonghyun Yoon and
                 Jeong-Uk Kang and Moonsang Kwon and Chanho Yoon and
                 Sangyeun Cho and Jaeheon Jeong and Duckhyun Chang",
  title =        "{Biscuit}: a framework for near-data processing of big
                 data workloads",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "153--165",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001154",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Data-intensive queries are common in business
                 intelligence, data warehousing and analytics
                 applications. Typically, processing a query involves
                 full inspection of large in-storage data sets by CPUs.
                 An intuitive way to speed up such queries is to reduce
                 the volume of data transferred over the storage network
                 to a host system. This can be achieved by filtering out
                 extraneous data within the storage, motivating a form
                 of near-data processing. This work presents Biscuit, a
                 novel near-data processing framework designed for
                 modern solid-state drives. It allows programmers to
                 write a data-intensive application to run on the host
                 system and the storage system in a distributed, yet
                 seamless manner. In order to offer a high-level
                 programming model, Biscuit builds on the concept of
                 data flow. Data processing tasks communicate through
                 typed and data-ordered ports. Biscuit does not
                 distinguish tasks that run on the host system and the
                 storage system. As the result, Biscuit has desirable
                 traits like generality and expressiveness, while
                 promoting code reuse and naturally exposing
                 concurrency. We implement Biscuit on a host system that
                 runs the Linux OS and a high-performance solid-state
                 drive. We demonstrate the effectiveness of our approach
                 and implementation with experimental results. When data
                 filtering is done by hardware in the solid-state drive,
                 the average speed-up obtained for the top five queries
                 of TPC-H is over 15$ \times $.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

%%% ISCA '16 paper in Computer Architecture News 44(3), pages 166--177:
%%% a configurable SystemC-based architecture template for iterative
%%% vertex-centric graph analytics accelerators.  Multiplicative factors
%%% in the abstract are typeset as math ($ \times $), as in the other
%%% entries of this file.
@Article{Ozdal:2016:EEA,
  author =       "Muhammet Mustafa Ozdal and Serif Yesil and Taemin Kim
                 and Andrey Ayupov and John Greth and Steven Burns and
                 Ozcan Ozturk",
  title =        "Energy efficient architecture for graph analytics
                 accelerators",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "166--177",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001155",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Specialized hardware accelerators can significantly
                 improve the performance and power efficiency of compute
                 systems. In this paper, we focus on hardware
                 accelerators for graph analytics applications and
                 propose a configurable architecture template that is
                 specifically optimized for iterative vertex-centric
                 graph applications with irregular access patterns and
                 asymmetric convergence. The proposed architecture
                 addresses the limitations of the existing multi-core
                 CPU and GPU architectures for these types of
                 applications. The SystemC-based template we provide can
                 be customized easily for different vertex-centric
                 applications by inserting application-level data
                 structures and functions. After that, a cycle-accurate
                 simulator and RTL can be generated to model the target
                 hardware accelerators. In our experiments, we study
                 several graph-parallel applications, and show that the
                 hardware accelerators generated by our template can
                 outperform a 24 core high end server CPU system by up
                 to 3$ \times $ in terms of performance. We also
                 estimate the area requirement and power consumption of
                 these hardware accelerators through physical-aware
                 logic synthesis, and show up to 65$ \times $ better
                 power consumption with significantly smaller area.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

%%% ISCA '16 paper in Computer Architecture News 44(3), pages 178--190:
%%% design and total-cost-of-ownership analysis of ASIC Clouds,
%%% datacenters built from arrays of ASIC accelerators.  The numeric
%%% range in the abstract uses a TeX en-dash (2--3).
@Article{Magaki:2016:ACS,
  author =       "Ikuo Magaki and Moein Khazraee and Luis Vega Gutierrez
                 and Michael Bedford Taylor",
  title =        "{ASIC} clouds: specializing the datacenter",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "178--190",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001156",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "GPU and FPGA-based clouds have already demonstrated
                 the promise of accelerating computing-intensive
                 workloads with greatly improved power and performance.
                 In this paper, we examine the design of ASIC Clouds,
                 which are purpose-built datacenters comprised of large
                 arrays of ASIC accelerators, whose purpose is to
                 optimize the total cost of ownership (TCO) of large,
                 high-volume chronic computations, which are becoming
                 increasingly common as more and more services are built
                 around the Cloud model. On the surface, the creation of
                 ASIC clouds may seem highly improbable due to high NREs
                 and the inflexibility of ASICs. Surprisingly, however,
                 large-scale ASIC Clouds have already been deployed by a
                 large number of commercial entities, to implement the
                 distributed Bitcoin cryptocurrency system. We begin
                 with a case study of Bitcoin mining ASIC Clouds, which
                 are perhaps the largest ASIC Clouds to date. From
                 there, we design three more ASIC Clouds, including a
                 YouTube-style video transcoding ASIC Cloud, a Litecoin
                 ASIC Cloud, and a Convolutional Neural Network ASIC
                 Cloud and show 2--3 orders of magnitude better TCO
                 versus CPU and GPU. Among our contributions, we present
                 a methodology that given an accelerator design, derives
                 Pareto-optimal ASIC Cloud Servers, by extracting data
                 from place-and-routed circuits and computational fluid
                 dynamic simulations, and then employing clever but
                 brute-force search to find the best jointly-optimized
                 ASIC, DRAM subsystem, motherboard, power delivery
                 system, cooling system, operating voltage, and case
                 design. Moreover, we show how data center parameters
                 determine which of the many Pareto-optimal points is
                 TCO-optimal. Finally we examine when it makes sense to
                 build an ASIC Cloud, and examine the impact of ASIC
                 NRE.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

%%% ISCA '16 paper in Computer Architecture News 44(3), pages 191--203:
%%% APRES (Adaptive PREfetching and Scheduling), which groups warps by
%%% load instruction to improve GPU cache efficiency.
@Article{Oh:2016:AIC,
  author =       "Yunho Oh and Keunsoo Kim and Myung Kuk Yoon and Jong
                 Hyun Park and Yongjun Park and Won Woo Ro and Murali
                 Annavaram",
  title =        "{APRES}: improving cache efficiency by exploiting load
                 characteristics on {GPUs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "191--203",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001158",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Long memory latency and limited throughput become
                 performance bottlenecks of GPGPU applications. The
                 latency takes hundreds of cycles which is difficult to
                 be hidden by simply interleaving tens of warp
                 execution. While cache hierarchy helps to reduce memory
                 system pressure, massive Thread-Level Parallelism (TLP)
                 often causes excessive cache contention. This paper
                 proposes Adaptive PREfetching and Scheduling (APRES) to
                 improve GPU cache efficiency. APRES relies on the
                 following observations. First, certain static load
                 instructions tend to generate memory addresses having
                 very high locality. Second, although loads have no
                 locality, the access addresses still can show highly
                 strided access pattern. Third, the locality behavior
                 tends to be consistent regardless of warp ID. APRES
                 schedules warps so that as many cache hits generated as
                 possible before any cache misses generated. This is to
                 minimize cache thrashing when many warps are contending
                 for a cache line. However, to realize this operation,
                 it is required to predict which warp will hit the cache
                 in the near future. Without directly predicting future
                 cache hit/miss for each warp, APRES creates a group of
                 warps that will execute the same load instruction in
                 the near future. Based on the third observation, we
                 expect the locality behavior is consistent over all
                 warps in the group. If the first executed warp in the
                 group hits the cache, then the load is considered as a
                 high locality type, and APRES prioritizes all warps in
                 the group. Group prioritization leads to consecutive
                 cache hits, because the grouped warps are likely to
                 access the same cache line. If the first warp missed
                 the cache, then the load is considered as a strided
                 type, and APRES generates prefetch requests for the
                 other warps in the group. After that, APRES prioritizes
                 prefetch targeted warps so that the demand requests are
                 merged to Miss Status Holding Register (MSHR) or
                 prefetched lines can be accessed. On memory-intensive
                 applications, APRES achieves 31.7\% performance
                 improvement compared to the baseline GPU and 7.2\%
                 additional speedup compared to the best combination of
                 existing warp scheduling and prefetching methods.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

%%% ISCA '16 paper in Computer Architecture News 44(3), pages 204--216:
%%% TOM (Transparent Offloading and Mapping), compiler-based offload
%%% selection plus software/hardware cooperative page placement for
%%% near-data processing in GPU systems with 3D-stacked memory.
@Article{Hsieh:2016:TOM,
  author =       "Kevin Hsieh and Eiman Ebrahimi and Gwangsun Kim and
                 Niladrish Chatterjee and Mike O'Connor and Nandita
                 Vijaykumar and Onur Mutlu and Stephen W. Keckler",
  title =        "Transparent offloading and mapping {(TOM)}: enabling
                 programmer-transparent near-data processing in {GPU}
                 systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "204--216",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001159",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Main memory bandwidth is a critical bottleneck for
                 modern GPU systems due to limited off-chip pin
                 bandwidth. 3D-stacked memory architectures provide a
                 promising opportunity to significantly alleviate this
                 bottleneck by directly connecting a logic layer to the
                 DRAM layers with high bandwidth connections. Recent
                 work has shown promising potential performance benefits
                 from an architecture that connects multiple such
                 3D-stacked memories and offloads bandwidth-intensive
                 computations to a GPU in each of the logic layers. An
                 unsolved key challenge in such a system is how to
                 enable computation offloading and data mapping to
                 multiple 3D-stacked memories without burdening the
                 programmer such that any application can transparently
                 benefit from near-data processing capabilities in the
                 logic layer. Our paper develops two new mechanisms to
                 address this key challenge. First, a compiler-based
                 technique that automatically identifies code to offload
                 to a logic-layer GPU based on a simple cost-benefit
                 analysis. Second, a software/hardware cooperative
                 mechanism that predicts which memory pages will be
                 accessed by offloaded code, and places those pages in
                 the memory stack closest to the offloaded code, to
                 minimize off-chip bandwidth consumption. We call the
                 combination of these two programmer-transparent
                 mechanisms TOM: Transparent Offloading and Mapping. Our
                 extensive evaluations across a variety of modern
                 memory-intensive GPU workloads show that, without
                 requiring any program modification, TOM significantly
                 improves performance (by 30\% on average, and up to
                 76\%) compared to a baseline GPU system that cannot
                 offload computation to 3D-stacked memories.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

%%% ISCA '16 paper in Computer Architecture News 44(3), pages 217--229:
%%% hybrid virtual caching with Bloom-filter-based synonym detection and
%%% delayed many-segment address translation.
%%% NOTE(review): the trailing "b" on the citation key presumably
%%% disambiguates it from another Park:2016:ESF key elsewhere in this
%%% file -- confirm before renaming.
@Article{Park:2016:ESFb,
  author =       "Chang Hyun Park and Taekyung Heo and Jaehyuk Huh",
  title =        "Efficient synonym filtering and scalable delayed
                 translation for hybrid virtual caching",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "217--229",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001160",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Conventional translation look-aside buffers (TLBs) are
                 required to complete address translation with short
                 latencies, as the address translation is on the
                 critical path of all memory accesses even for L1 cache
                 hits. Such strict TLB latency restrictions limit the
                 TLB capacity, as the latency increase with large TLBs
                 may lower the overall performance even with potential
                 TLB miss reductions. Furthermore, TLBs consume a
                 significant amount of energy as they are accessed for
                 every instruction fetch and data access. To avoid the
                 latency restriction and reduce the energy consumption,
                 virtual caching techniques have been proposed to defer
                 translation to after L1 cache misses. However, an
                 efficient solution for the synonym problem has been a
                 critical issue hindering the wide adoption of virtual
                 caching. Based on the virtual caching concept, this
                 study proposes a hybrid virtual memory architecture
                 extending virtual caching to the entire cache
                 hierarchy, aiming to improve both performance and
                 energy consumption. The hybrid virtual caching uses
                 virtual addresses augmented with address space
                 identifiers (ASID) in the cache hierarchy for common
                 non-synonym addresses. For such non-synonyms, the
                 address translation occurs only after last-level cache
                 (LLC) misses. For uncommon synonym addresses, the
                 addresses are translated to physical addresses with
                 conventional TLBs before L1 cache accesses. To support
                 such hybrid translation, we propose an efficient
                 synonym detection mechanism based on Bloom filters
                 which can identify synonym candidates with few false
                 positives. For large memory applications, delayed
                 translation alone cannot solve the address translation
                 problem, as fixed-granularity delayed TLBs may not
                 scale with the increasing memory requirements. To
                 mitigate the translation scalability problem, this
                 study proposes a delayed many segment translation
                 designed for the hybrid virtual caching. The
                 experimental results show that our approach effectively
                 lowers accesses to the TLBs, leading to significant
                 power savings. In addition, the approach provides
                 performance improvement with scalable delayed
                 translation with variable length segments.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

%%% ISCA '16 paper in Computer Architecture News 44(3), pages 230--242:
%%% Warped-Slicer, a dynamic intra-SM resource partitioning strategy for
%%% GPU multiprogramming.  The coined name is brace-protected in the
%%% title so that bibliography styles cannot downcase it.
@Article{Xu:2016:WSE,
  author =       "Qiumin Xu and Hyeran Jeon and Keunsoo Kim and Won Woo
                 Ro and Murali Annavaram",
  title =        "{Warped-Slicer}: efficient intra-{SM} slicing through
                 dynamic resource partitioning for {GPU}
                 multiprogramming",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "230--242",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001161",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "As technology scales, GPUs are forecasted to
                 incorporate an ever-increasing amount of computing
                 resources to support thread-level parallelism. But even
                 with the best effort, exposing massive thread-level
                 parallelism from a single GPU kernel, particularly from
                 general purpose applications, is going to be a
                 difficult challenge. In some cases, even if there is
                 sufficient thread-level parallelism in a kernel, there
                 may not be enough available memory bandwidth to support
                 such massive concurrent thread execution. Hence, GPU
                 resources may be underutilized as more general purpose
                 applications are ported to execute on GPUs. In this
                 paper, we explore multiprogramming GPUs as a way to
                 resolve the resource underutilization issue. There is a
                 growing hardware support for multiprogramming on GPUs.
                 Hyper-Q has been introduced in the Kepler architecture
                 which enables multiple kernels to be invoked via tens
                 of hardware queue streams. Spatial multitasking has
                 been proposed to partition GPU resources across
                 multiple kernels. But the partitioning is done at the
                 coarse granularity of streaming multiprocessors (SMs)
                 where each kernel is assigned to a subset of SMs. In
                 this paper, we advocate for partitioning a single SM
                 across multiple kernels, which we term as intra-SM
                 slicing. We explore various intra-SM slicing strategies
                 that slice resources within each SM to concurrently run
                 multiple kernels on the SM. Our results show that there
                 is not one intra-SM slicing strategy that derives the
                 best performance for all application pairs. We propose
                 Warped-Slicer, a dynamic intra-SM slicing strategy that
                 uses an analytical method for calculating the SM
                 resource partitioning across different kernels that
                 maximizes performance. The model relies on a set of
                 short online profile runs to determine how each
                 kernel's performance varies as more thread blocks from
                 each kernel are assigned to an SM. The model takes into
                 account the interference effect of shared resource
                 usage across multiple kernels. The model is also
                 computationally efficient and can determine the
                 resource partitioning quickly to enable dynamic
                 decision making as new kernels enter the system. We
                 demonstrate that the proposed Warped-Slicer approach
                 improves performance by 23\% over the baseline
                 multiprogramming approach with minimal hardware
                 overhead.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

@Article{Han:2016:EEI,
  author =       "Song Han and Xingyu Liu and Huizi Mao and Jing Pu and
                 Ardavan Pedram and Mark A. Horowitz and William J.
                 Dally",
  title =        "{EIE}: efficient inference engine on compressed deep
                 neural network",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "243--254",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001163",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "State-of-the-art deep neural networks (DNNs) have
                 hundreds of millions of connections and are both
                 computationally and memory intensive, making them
                 difficult to deploy on embedded systems with limited
                 hardware resources and power budgets. While custom
                 hardware helps the computation, fetching weights from
                 DRAM is two orders of magnitude more expensive than ALU
                 operations, and dominates the required power.
                 Previously proposed `Deep Compression' makes it
                 possible to fit large DNNs (AlexNet and VGGNet) fully
                 in on-chip SRAM. This compression is achieved by
                 pruning the redundant connections and having multiple
                 connections share the same weight. We propose an energy
                 efficient inference engine (EIE) that performs
                 inference on this compressed network model and
                 accelerates the resulting sparse matrix-vector
                 multiplication with weight sharing. Going from DRAM to
                 SRAM gives EIE 120$ \times $ energy saving; Exploiting
                 sparsity saves 10$ \times $; Weight sharing gives 8$
                 \times $; Skipping zero activations from ReLU saves
                 another 3$ \times $. Evaluated on nine DNN benchmarks,
                 EIE is 189$ \times $ and 13$ \times $ faster when
                 compared to CPU and GPU implementations of the same DNN
                 without compression. EIE has a processing power of 102
                 GOPS working directly on a compressed network,
                 corresponding to 3 TOPS on an uncompressed network, and
                 processes FC layers of AlexNet at $ 1.88 \times 10^4 $
                 frames/sec with a power dissipation of only 600mW. It
                 is 24,000$ \times $ and 3,400$ \times $ more energy
                 efficient than a CPU and GPU respectively. Compared
                 with DaDianNao, EIE has 2.9$ \times $, 19$ \times $ and
                 3$ \times $ better throughput, energy efficiency and
                 area efficiency.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

@Article{LiKamWa:2016:RAC,
  author =       {LiKamWa, Robert and Hou, Yunhui and Gao, Julian and
                 Polansky, Mia and Zhong, Lin},
  title =        {{RedEye}: analog {ConvNet} image sensor architecture
                 for continuous mobile vision},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {44},
  number =       {3},
  pages =        {255--266},
  month =        jun,
  year =         {2016},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/3007787.3001164},
  ISSN =         {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L =       {0163-5964},
  bibdate =      {Thu Jan 12 18:43:43 MST 2017},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract =     {Continuous mobile vision is limited by the inability
                 to efficiently capture image frames and process vision
                 features. This is largely due to the energy burden of
                 analog readout circuitry, data traffic, and intensive
                 computation. To promote efficiency, we shift early
                 vision processing into the analog domain. This results
                 in RedEye, an analog convolutional image sensor that
                 performs layers of a convolutional neural network in
                 the analog domain before quantization. We design RedEye
                 to mitigate analog design complexity, using a modular
                 column-parallel design to promote physical design reuse
                 and algorithmic cyclic reuse. RedEye uses programmable
                 mechanisms to admit noise for tunable energy reduction.
                 Compared to conventional systems, RedEye reports an
                 85\% reduction in sensor energy, 73\% reduction in
                 cloudlet-based system energy, and a 45\% reduction in
                 computation-based system energy.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
  remark =       {ISCA '16 conference proceedings.},
}

@Article{Reagen:2016:MEL,
  author =       {Reagen, Brandon and Whatmough, Paul and Adolf, Robert
                 and Rama, Saketh and Lee, Hyunkwang and Lee, Sae Kyu
                 and Hern{\'a}ndez-Lobato, Jos{\'e} Miguel and Wei,
                 Gu-Yeon and Brooks, David},
  title =        {{Minerva}: enabling low-power, highly-accurate deep
                 neural network accelerators},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {44},
  number =       {3},
  pages =        {267--278},
  month =        jun,
  year =         {2016},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/3007787.3001165},
  ISSN =         {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L =       {0163-5964},
  bibdate =      {Thu Jan 12 18:43:43 MST 2017},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract =     {The continued success of Deep Neural Networks (DNNs)
                 in classification tasks has sparked a trend of
                 accelerating their execution with specialized hardware.
                 While published designs easily give an order of
                 magnitude improvement over general-purpose hardware,
                 few look beyond an initial implementation. This paper
                 presents Minerva, a highly automated co-design approach
                 across the algorithm, architecture, and circuit levels
                 to optimize DNN hardware accelerators. Compared to an
                 established fixed-point accelerator baseline, we show
                 that fine-grained, heterogeneous datatype optimization
                 reduces power by 1.5$ \times $; aggressive, inline
                 predication and pruning of small activity values
                 further reduces power by 2.0$ \times $; and active
                 hardware fault detection coupled with domain-aware
                 error mitigation eliminates an additional 2.7$ \times $
                 through lowering SRAM voltages. Across five datasets,
                 these optimizations provide a collective average of
                 8.1$ \times $ power reduction over an accelerator
                 baseline without compromising DNN model accuracy.
                 Minerva enables highly accurate, ultra-low power DNN
                 accelerators (in the range of tens of milliwatts),
                 making it feasible to deploy DNNs in power-constrained
                 IoT and mobile devices.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
  remark =       {ISCA '16 conference proceedings.},
}

@Article{Yao:2016:OCO,
  author =       {Yao, Yuan and Lu, Zhonghai},
  title =        {Opportunistic competition overhead reduction for
                 expediting critical section in {NoC} based {CMPs}},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {44},
  number =       {3},
  pages =        {279--290},
  month =        jun,
  year =         {2016},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/3007787.3001167},
  ISSN =         {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L =       {0163-5964},
  bibdate =      {Thu Jan 12 18:43:43 MST 2017},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract =     {With the degree of parallelism increasing, performance
                 of multi-threaded shared variable applications is not
                 only limited by serialized critical section execution,
                 but also by the serialized competition overhead for
                 threads to get access to critical section. As the
                 number of concurrent threads grows, such competition
                 overhead may exceed the time spent in critical section
                 itself, and become the dominating factor limiting the
                 performance of parallel applications. In modern
                 operating systems, queue spinlock, which comprises a
                 low-overhead spinning phase and a high-overhead
                 sleeping phase, is often used to lock critical
                 sections. In the paper, we show that this advanced
                 locking solution may create very high competition
                 overhead for multithreaded applications executing in
                 NoC-based CMPs. Then we propose a software-hardware
                 cooperative mechanism that can opportunistically
                 maximize the chance that a thread wins the critical
                 section access in the low-overhead spinning phase,
                 thereby reducing the competition overhead. At the OS
                 primitives level, we monitor the remaining times of
                 retry (RTR) in a thread's spinning phase, which
                 reflects in how long the thread must enter into the
                 high-overhead sleep mode. At the hardware level, we
                 integrate the RTR information into the packets of
                 locking requests, and let the NoC prioritize locking
                 request packets according to the RTR information. The
                 principle is that the smaller RTR a locking request
                 packet carries, the higher priority it gets and thus
                 quicker delivery. We evaluate our opportunistic
                 competition overhead reduction technique with
                 cycle-accurate full-system simulations in GEM5 using
                 PARSEC (11 programs) and SPEC OMP2012 (14 programs)
                 benchmarks. Compared to the original queue spinlock
                 implementation, experimental results show that our
                 method can effectively increase the opportunity of
                 threads entering the critical section in low-overhead
                 spinning phase, reducing the competition overhead
                 averagely by 39.9\% (maximally by 61.8\%) and
                 accelerating the execution of the Region-of-Interest
                 averagely by 14.4\% (maximally by 24.5\%) across all 25
                 benchmark programs.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
  remark =       {ISCA '16 conference proceedings.},
}

@Article{Kim:2016:SCD,
  author =       "Channoh Kim and Sungmin Kim and Hyeon Gyu Cho and
                 Dooyoung Kim and Jaehyeok Kim and Young H. Oh and
                 Hakbeom Jang and Jae W. Lee",
  title =        "Short-circuit dispatch: accelerating virtual machine
                 interpreters on embedded processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "291--303",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001168",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "Interpreters are widely used to implement high-level
                 language virtual machines (VMs), especially on
                 resource-constrained embedded platforms. Many scripting
                 languages employ interpreter-based VMs for their
                 advantages over native code compilers, such as
                 portability, smaller resource footprint, and compact
                 codes. For efficient interpretation a script (program)
                 is first compiled into an intermediate representation,
                 or bytecodes. The canonical interpreter then runs an
                 infinite loop that fetches, decodes, and executes one
                 bytecode at a time. This bytecode dispatch loop is a
                 well-known source of inefficiency, typically featuring
                 a large jump table with a hard-to-predict indirect
                 jump. Most existing techniques to optimize this loop
                 focus on reducing the misprediction rate of this
                 indirect jump in both hardware and software. However,
                 these techniques are much less effective on embedded
                 processors with shallow pipelines and low IPCs.
                 Instead, we tackle another source of inefficiency more
                 prominent on embedded platforms---redundant computation
                 in the dispatch loop. To this end, we propose
                 Short-Circuit Dispatch (SCD), a low-cost architectural
                 extension that enables fast, hardware-based bytecode
                 dispatch with fewer instructions. The key idea of SCD
                 is to overlay the software-created bytecode jump table
                 on a branch target buffer (BTB). Once a bytecode is
                 fetched, the BTB is looked up using the bytecode,
                 instead of PC, as key. If it hits, the interpreter
                 directly jumps to the target address retrieved from the
                 BTB; otherwise, it goes through the original dispatch
                 path. This effectively eliminates redundant computation
                 in the dispatcher code for decode, bound check, and
                 target address calculation, thus significantly reducing
                 total instruction count. Our simulation results
                 demonstrate that SCD achieves geomean speedups of
                 19.9\% and 14.1\% for two production-grade script
                 interpreters for Lua and JavaScript, respectively.
                 Moreover, our fully synthesizable RTL design based on a
                 RISC-V embedded processor shows that SCD improves the
                 EDP of the Lua interpreter by 24.2\%, while increasing
                 the chip area by only 0.72\% at a 40nm technology
                 node.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

@Article{Dall:2016:AVP,
  author =       {Dall, Christoffer and Li, Shih-Wei and Lim, Jin Tack
                 and Nieh, Jason and Koloventzos, Georgios},
  title =        {{ARM} virtualization: performance and architectural
                 implications},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {44},
  number =       {3},
  pages =        {304--316},
  month =        jun,
  year =         {2016},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/3007787.3001169},
  ISSN =         {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L =       {0163-5964},
  bibdate =      {Thu Jan 12 18:43:43 MST 2017},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib},
  abstract =     {ARM servers are becoming increasingly common, making
                 server technologies such as virtualization for ARM of
                 growing importance. We present the first study of ARM
                 virtualization performance on server hardware,
                 including multicore measurements of two popular ARM and
                 x86 hypervisors, KVM and Xen. We show how ARM hardware
                 support for virtualization can enable much faster
                 transitions between VMs and the hypervisor, a key
                 hypervisor operation. However, current hypervisor
                 designs, including both Type 1 hypervisors such as Xen
                 and Type 2 hypervisors such as KVM, are not able to
                 leverage this performance benefit for real application
                 workloads. We discuss the reasons why and show that
                 other factors related to hypervisor software design and
                 implementation have a larger role in overall
                 performance. Based on our measurements, we discuss
                 changes to ARM's hardware virtualization support that
                 can potentially bridge the gap to bring its faster
                 VM-to-hypervisor transition mechanism to modern Type 2
                 hypervisors running real applications. These changes
                 have been incorporated into the latest ARM
                 architecture.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
  remark =       {ISCA '16 conference proceedings.},
}

@Article{Gaur:2016:BVC,
  author =       {Gaur, Jayesh and Alameldeen, Alaa R. and Subramoney,
                 Sreenivas},
  title =        {Base-victim compression: an opportunistic cache
                 compression architecture},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {44},
  number =       {3},
  pages =        {317--328},
  month =        jun,
  year =         {2016},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/3007787.3001171},
  ISSN =         {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L =       {0163-5964},
  bibdate =      {Thu Jan 12 18:43:43 MST 2017},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract =     {The memory wall has motivated many enhancements to
                 cache management policies aimed at reducing misses.
                 Cache compression has been proposed to increase
                 effective cache capacity, which potentially reduces
                 capacity and conflict misses. However, complexity in
                 cache compression implementations could increase cache
                 power and access latency. On the other hand, advanced
                 cache replacement mechanisms use heuristics to reduce
                 misses, leading to significant performance gains. Both
                 cache compression and replacement policies should
                 collaborate to improve performance. In this paper, we
                 demonstrate that cache compression and replacement
                 policies can interact negatively. In many workloads,
                 performance gains from replacement policies are lost
                 due to the need to alter the replacement policy to
                 accommodate compression. This leads to sub-optimal
                 replacement policies that could lose performance
                 compared to an uncompressed cache. We introduce a
                 novel, opportunistic cache compression mechanism,
                 Base-Victim, based on an efficient cache design. Our
                 compression architecture improves performance on top of
                 advanced cache replacement policies, and guarantees a
                 hit rate at least as high as that of an uncompressed
                 cache. For cache-sensitive applications, Base-Victim
                 achieves an average 7.3\% performance gain for
                 single-threaded workloads, and 8.7\% gain for
                 four-thread multi-program workload mixes.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
  remark =       {ISCA '16 conference proceedings.},
}

@Article{Kim:2016:BPC,
  author =       {Kim, Jungrae and Sullivan, Michael and Choukse, Esha
                 and Erez, Mattan},
  title =        {Bit-plane compression: transforming data for better
                 compression in many-core architectures},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {44},
  number =       {3},
  pages =        {329--340},
  month =        jun,
  year =         {2016},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/3007787.3001172},
  ISSN =         {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L =       {0163-5964},
  bibdate =      {Thu Jan 12 18:43:43 MST 2017},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract =     {As key applications become more data-intensive and the
                 computational throughput of processors increases, the
                 amount of data to be transferred in modern memory
                 subsystems grows. Increasing physical bandwidth to keep
                 up with the demand growth is challenging, however, due
                 to strict area and energy limitations. This paper
                 presents a novel and lightweight compression algorithm,
                 Bit-Plane Compression (BPC), to increase the effective
                 memory bandwidth. BPC aims at homogeneously-typed
                 memory blocks, which are prevalent in many-core
                 architectures, and applies a smart data transformation
                 to both improve the inherent data compressibility and
                 to reduce the complexity of compression hardware. We
                 demonstrate that BPC provides superior compression
                 ratios of 4.1:1 for integer benchmarks and reduces
                 memory bandwidth requirements significantly.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
  remark =       {ISCA '16 conference proceedings.},
}

@Article{Nair:2016:XEE,
  author =       "Prashant J. Nair and Vilas Sridharan and Moinuddin K.
                 Qureshi",
  title =        "{XED}: exposing on-die error detection information for
                 strong memory reliability",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "341--353",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001174",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Large-granularity memory failures continue to be a
                 critical impediment to system reliability. To make
                 matters worse, as DRAM scales to smaller nodes, the
                 frequency of unreliable bits in DRAM chips continues to
                 increase. To mitigate such scaling-related failures,
                 memory vendors are planning to equip existing DRAM
                 chips with On-Die ECC. For maintaining compatibility
                 with memory standards, On-Die ECC is kept invisible
                 from the memory controller. This paper explores how to
                 design high reliability memory systems in presence of
                 On-Die ECC. We show that if On-Die ECC is not exposed
                 to the memory system, having a 9-chip ECC-DIMM
                 (implementing SECDED) provides almost no reliability
                 benefits compared to an 8-chip non-ECC DIMM. We also
                 show that if the error detection of On-Die ECC can be
                 exposed to the memory controller, then Chipkill-level
                 reliability can be achieved even with a 9-chip
                 ECC-DIMM. To this end, we propose eXposed On-Die Error
                 Detection (XED), which exposes the On-Die error
                 detection information without requiring changes to the
                 memory standards or consuming bandwidth overheads. When
                 the On-Die ECC detects an error, XED transmits a
                 pre-defined ``catch-word'' instead of the corrected
                 data value. On receiving the catch-word, the memory
                 controller uses the parity stored in the 9-chip of the
                 ECC-DIMM to correct the faulty chip (similar to
                 RAID-3). Our studies show that XED provides
                 Chipkill-level reliability (172$ \times $ higher than
                 SECDED), while incurring negligible overheads, with a
                 21\% lower execution time than Chipkill. We also show
                 that XED can enable Chipkill systems to provide
                 Double-Chipkill level reliability while avoiding the
                 associated storage, performance, and power overheads.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

@Article{ulAlam:2016:PRS,
  author =       "Mohammad Mejbah ul Alam and Abdullah Muzahid",
  title =        "Production-run software failure diagnosis via
                 {\underline{a}}daptive {\underline{c}}ommunication
                 {\underline{t}}racking",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "354--366",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001175",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Software failure diagnosis techniques work either by
                 sampling some events at production-run time or by using
                 some bug detection algorithms. Some of the techniques
                 require the failure to be reproduced multiple times.
                 The ones that do not require such, are not adaptive
                 enough when the execution platform, environment or code
                 changes. We propose ACT, a diagnosis technique for
                 production-run failures, that uses the machine
                 intelligence of neural hardware. ACT learns some
                 invariants (e.g., data communication invariants)
                 on-the-fly using the neural hardware and records any
                 potential violation of them. Since ACT can learn
                 invariants on-the-fly, it can adapt to any change in
                 execution setting or code. Since it records only the
                 potentially violated invariants, the postprocessing
                 phase can pinpoint the root cause fairly accurately
                 without requiring to observe the failure again. ACT
                 works seamlessly for many sequential and concurrency
                 bugs. The paper provides a detailed design and
                 implementation of ACT in a typical multiprocessor
                 system. It uses a three stage pipeline for partially
                 configurable one hidden layer neural networks. We have
                 evaluated ACT on a variety of programs from popular
                 benchmarks as well as open source programs. ACT
                 diagnoses failures caused by 16 bugs from these
                 programs with accurate ranking. Compared to existing
                 learning and sampling based approaches, ACT has better
                 diagnostic ability. For the default configuration, ACT
                 has an average execution overhead of 8.2\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

@Article{Chen:2016:ESA,
  author =       {Chen, Yu-Hsin and Emer, Joel and Sze, Vivienne},
  title =        {{Eyeriss}: a spatial architecture for energy-efficient
                 dataflow for convolutional neural networks},
  journal =      j-COMP-ARCH-NEWS,
  volume =       {44},
  number =       {3},
  pages =        {367--379},
  month =        jun,
  year =         {2016},
  CODEN =        {CANED2},
  DOI =          {https://doi.org/10.1145/3007787.3001177},
  ISSN =         {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L =       {0163-5964},
  bibdate =      {Thu Jan 12 18:43:43 MST 2017},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract =     {Deep convolutional neural networks (CNNs) are widely
                 used in modern AI systems for their superior accuracy
                 but at the cost of high computational complexity. The
                 complexity comes from the need to simultaneously
                 process hundreds of filters and channels in the
                 high-dimensional convolutions, which involve a
                 significant amount of data movement. Although
                 highly-parallel compute paradigms, such as SIMD/SIMT,
                 effectively address the computation requirement to
                 achieve high throughput, energy consumption still
                 remains high as data movement can be more expensive
                 than computation. Accordingly, finding a dataflow that
                 supports parallel processing with minimal data movement
                 cost is crucial to achieving energy-efficient CNN
                 processing without compromising accuracy. In this
                 paper, we present a novel dataflow, called
                 row-stationary (RS), that minimizes data movement
                 energy consumption on a spatial architecture. This is
                 realized by exploiting local data reuse of filter
                 weights and feature map pixels, i.e., activations, in
                 the high-dimensional convolutions, and minimizing data
                 movement of partial sum accumulations. Unlike dataflows
                 used in existing designs, which only reduce certain
                 types of data movement, the proposed RS dataflow can
                 adapt to different CNN shape configurations and reduces
                 all types of data movement through maximally utilizing
                 the processing engine (PE) local storage, direct
                 inter-PE communication and spatial parallelism. To
                 evaluate the energy efficiency of the different
                 dataflows, we propose an analysis framework that
                 compares energy cost under the same hardware area and
                 processing parallelism constraints. Experiments using
                 the CNN configurations of AlexNet show that the
                 proposed RS dataflow is more energy efficient than
                 existing dataflows in both convolutional (1.4$ \times $
                 to 2.5$ \times $) and fully-connected layers (at least
                 1.3$ \times $ for batch size larger than 16). The RS
                 dataflow has also been demonstrated on a fabricated
                 chip, which verifies our energy analysis.},
  acknowledgement = ack-nhfb,
  fjournal =     {ACM SIGARCH Computer Architecture News},
  journal-URL =  {https://dl.acm.org/loi/sigarch},
  remark =       {ISCA '16 conference proceedings.},
}

@Article{Kim:2016:NPD,
  author =       "Duckhwan Kim and Jaeha Kung and Sek Chai and Sudhakar
                 Yalamanchili and Saibal Mukhopadhyay",
  title =        "{Neurocube}: a programmable digital neuromorphic
                 architecture with high-density {$3$D} memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "380--392",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001178",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper presents a programmable and scalable
                 digital neuromorphic architecture based on 3D
                 high-density memory integrated with logic tier for
                 efficient neural computing. The proposed architecture
                 consists of clusters of processing engines, connected
                 by 2D mesh network as a processing tier, which is
                 integrated in 3D with multiple tiers of DRAM. The PE
                 clusters access multiple memory channels (vaults) in
                 parallel. The operating principle, referred to as the
                 memory centric computing, embeds specialized
                 state-machines within the vault controllers of HMC to
                 drive data into the PE clusters. The paper presents the
                 basic architecture of the Neurocube and an analysis of
                 the logic tier synthesized in 28nm and 15nm process
                 technologies. The performance of the Neurocube is
                 evaluated and illustrated through the mapping of a
                 Convolutional Neural Network and estimating the
                 subsequent power and performance for both training and
                 inference.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

@Article{Liu:2016:CIS,
  author =       "Shaoli Liu and Zidong Du and Jinhua Tao and Dong Han
                 and Tao Luo and Yuan Xie and Yunji Chen and Tianshi
                 Chen",
  title =        "{Cambricon}: an instruction set architecture for
                 neural networks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "393--405",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001179",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Neural Networks (NN) are a family of models for a
                 broad range of emerging machine learning and pattern
                 recognition applications. NN techniques are
                 conventionally executed on general-purpose processors
                 (such as CPU and GPGPU), which are usually not
                 energy-efficient since they invest excessive hardware
                 resources to flexibly support various workloads.
                 Consequently, application-specific hardware
                 accelerators for neural networks have been proposed
                 recently to improve the energy-efficiency. However,
                 such accelerators were designed for a small set of NN
                 techniques sharing similar computational patterns, and
                 they adopt complex and informative instructions
                 (control signals) directly corresponding to high-level
                 functional blocks of an NN (such as layers), or even an
                 NN as a whole. Although straightforward and
                 easy-to-implement for a limited set of similar NN
                 techniques, the lack of agility in the instruction set
                 prevents such accelerator designs from supporting a
                 variety of different NN techniques with sufficient
                 flexibility and efficiency. In this paper, we propose a
                 novel domain-specific Instruction Set Architecture
                 (ISA) for NN accelerators, called Cambricon, which is a
                 load-store architecture that integrates scalar, vector,
                 matrix, logical, data transfer, and control
                 instructions, based on a comprehensive analysis of
                 existing NN techniques. Our evaluation over a total of
                 ten representative yet distinct NN techniques have
                 demonstrated that Cambricon exhibits strong descriptive
                 capacity over a broad range of NN techniques, and
                 provides higher code density than general-purpose ISAs
                 such as x86, MIPS, and GPGPU. Compared to the
                 latest state-of-the-art NN accelerator design DaDianNao
                 [5] (which can only accommodate 3 types of NN
                 techniques), our Cambricon-based accelerator prototype
                 implemented in TSMC 65nm technology incurs only
                 negligible latency/power/area overheads, with a
                 versatile coverage of 10 different NN benchmarks.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

@Article{Huang:2016:DLN,
  author =       "Ziqiang Huang and Andrew D. Hilton and Benjamin C.
                 Lee",
  title =        "Decoupling loads for nano-instruction set computers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "406--417",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001181",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "We propose an ISA extension that decouples the data
                 access and register write operations in a load
                 instruction. We describe system and hardware support
                 for decoupled loads. Furthermore, we show how compilers
                 can generate better static instruction schedules by
                 hoisting a decoupled load's data access above may-alias
                 stores and branches. We find that decoupled loads
                 improve performance with geometric mean speedups of
                 8.4\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

@Article{Hayes:2016:FVM,
  author =       "Timothy Hayes and Oscar Palomar and Osman Unsal and
                 Adrian Cristal and Mateo Valero",
  title =        "Future vector microprocessor extensions for data
                 aggregations",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "418--430",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001182",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "As the rate of annual data generation grows
                 exponentially, there is a demand to aggregate and
                 summarise vast amounts of information quickly. In the
                 past, frequency scaling was relied upon to push
                 application throughput. Today, Dennard scaling has
                 ceased and further performance must come from
                 exploiting parallelism. Single instruction-multiple
                 data (SIMD) instruction sets offer a highly efficient
                 and scalable way of exploiting data-level parallelism
                 (DLP). While microprocessors originally offered very
                 simple SIMD support targeted at multimedia
                 applications, these extensions have been growing both
                 in width and functionality. Observing this trend, we
                 use a simulation framework to model future SIMD support
                 and then propose and evaluate five different ways of
                 vectorising data aggregation. We find that although
                 data aggregation is abundant in DLP, it is often too
                 irregular to be expressed efficiently using typical
                 SIMD instructions. Based on this observation, we
                 propose a set of novel algorithms and SIMD instructions
                 to better capture this irregular DLP. Furthermore, we
                 discover that the best algorithm is highly dependent on
                 the characteristics of the input. Our proposed solution
                 can dynamically choose the optimal algorithm in the
                 majority of cases and achieves speedups between 2.7 $
                 \times $ and 7.6 $ \times $ over a scalar baseline.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

@Article{Sleiman:2016:ESO,
  author =       "Faissal M. Sleiman and Thomas F. Wenisch",
  title =        "Efficiently scaling out-of-order cores for
                 simultaneous multithreading",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "431--443",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001183",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Simultaneous multithreading (SMT) out-of-order cores
                 waste a significant portion of structural out-of-order
                 core resources on instructions that do not need them.
                 These resources eliminate false ordering dependences.
                 However, because thread interleaving spreads dependent
                 instructions, nearly half of instructions dynamically
                 issue in program order after all false dependences have
                 resolved. These in-sequence instructions interleave
                 with other reordered instructions at a fine granularity
                 within the instruction window. We develop a technique
                 to efficiently scale in-flight instructions through a
                 hybrid out-of-order/in-order microarchitecture, which
                 can dispatch instructions to efficient in-order
                 scheduling mechanisms---using a FIFO issue queue called
                 the shelf---on an instruction-by-instruction basis.
                 Instructions dispatched to the shelf do not allocate
                 out-of-order core resources in the reorder buffer,
                 issue queue, physical registers, or load-store queues.
                 We measure opportunity for such hybrid
                 microarchitectures and design and evaluate a practical
                 dispatch mechanism targeted at 4-threaded cores. Adding
                 a shelf to a baseline 4-thread system with 64-entry ROB
                 improves normalized system throughput by 11.5\% (up to
                 19.2\% at best) and energy-delay product by 10.9\% (up
                 to 17.5\% at best).",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

@Article{Hashemi:2016:ADC,
  author =       "Milad Hashemi and Khubaib and Eiman Ebrahimi and Onur
                 Mutlu and Yale N. Patt",
  title =        "Accelerating dependent cache misses with an enhanced
                 memory controller",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "444--455",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001184",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "On-chip contention increases memory access latency for
                 multicore processors. We identify that this additional
                 latency has a substantial effect on performance for an
                 important class of latency-critical memory operations:
                 those that result in a cache miss and are dependent on
                 data from a prior cache miss. We observe that the
                 number of instructions between the first cache miss and
                 its dependent cache miss is usually small. To minimize
                 dependent cache miss latency, we propose adding just
                 enough functionality to dynamically identify these
                 instructions at the core and migrate them to the memory
                 controller for execution as soon as source data arrives
                 from DRAM. This migration allows memory requests issued
                 by our new Enhanced Memory Controller (EMC) to
                 experience a 20\% lower latency than if issued by the
                 core. On a set of memory intensive quad-core workloads,
                 the EMC results in a 13\% improvement in system
                 performance and a 5\% reduction in energy consumption
                 over a system with a Global History Buffer prefetcher,
                 the highest performing prefetcher in our evaluation.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

@Article{Zhang:2016:TAS,
  author =       "Yunqi Zhang and David Meisner and Jason Mars and
                 Lingjia Tang",
  title =        "{Treadmill}: attributing the source of tail latency
                 through precise load testing and statistical
                 inference",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "456--468",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001186",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Managing tail latency of requests has become one of
                 the primary challenges for large-scale Internet
                 services. Data centers are quickly evolving and service
                 operators frequently desire to make changes to the
                 deployed software and production hardware
                 configurations. Such changes demand a confident
                 understanding of the impact on one's service, in
                 particular its effect on tail latency (e.g., 95th- or
                 99th-percentile response latency of the service).
                 Evaluating the impact on the tail is challenging
                 because of its inherent variability. Existing tools and
                 methodologies for measuring these effects suffer from a
                 number of deficiencies including poor load tester
                 design, statistically inaccurate aggregation, and
                 improper attribution of effects. As shown in the paper,
                 these pitfalls can often result in misleading
                 conclusions. In this paper, we develop a methodology
                 for statistically rigorous performance evaluation and
                 performance factor attribution for server workloads.
                 First, we find that careful design of the server load
                 tester can ensure high quality performance evaluation,
                 and empirically demonstrate the inaccuracy of load
                 testers in previous work. Learning from the design
                 flaws in prior work, we design and develop a modular
                 load tester platform, Treadmill, that overcomes
                 pitfalls of existing tools. Next, utilizing Treadmill,
                 we construct measurement and analysis procedures that
                 can properly attribute performance factors. We rely on
                 statistically-sound performance evaluation and quantile
                 regression, extending it to accommodate the
                 idiosyncrasies of server systems. Finally, we use our
                 augmented methodology to evaluate the impact of common
                 server hardware features with Facebook production
                 workloads on production hardware. We decompose the
                 effects of these features on request tail latency and
                 demonstrate that our evaluation methodology provides
                 superior results, particularly in capturing complicated
                 and counter-intuitive performance behaviors. By tuning
                 the hardware features as suggested by the attribution,
                 we reduce the 99th-percentile latency by 43\% and its
                 variance by 93\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

@Article{Wu:2016:DFD,
  author =       "Qiang Wu and Qingyuan Deng and Lakshmi Ganesh and
                 Chang-Hong Hsu and Yun Jin and Sanjeev Kumar and Bin Li
                 and Justin Meza and Yee Jiun Song",
  title =        "{Dynamo}: {Facebook}'s data center-wide power management
                 system",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "469--480",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001187",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Data center power is a scarce resource that often goes
                 underutilized due to conservative planning. This is
                 because the penalty for overloading the data center
                 power delivery hierarchy and tripping a circuit breaker
                 is very high, potentially causing long service outages.
                 Recently, dynamic server power capping, which limits
                 the amount of power consumed by a server, has been
                 proposed and studied as a way to reduce this penalty,
                 enabling more aggressive utilization of provisioned
                 data center power. However, no real at-scale solution
                 for data center-wide power monitoring and control has
                 been presented in the literature. In this paper, we
                 describe Dynamo --- a data center-wide power management
                 system that monitors the entire power hierarchy and
                 makes coordinated control decisions to safely and
                 efficiently use provisioned data center power. Dynamo
                 has been developed and deployed across all of
                 Facebook's data centers for the past three years. Our
                 key insight is that in real-world data centers,
                 different power and performance constraints at
                 different levels in the power hierarchy necessitate
                 coordinated data center-wide power management. We make
                 three main contributions. First, to understand the
                 design space of Dynamo, we provide a characterization
                 of power variation in data centers running a diverse
                 set of modern workloads. This characterization uses
                 fine-grained power samples from tens of thousands of
                 servers and spanning a period of over six months.
                 Second, we present the detailed design of Dynamo. Our
                 design addresses several key issues not addressed by
                 previous simulation-based studies. Third, the proposed
                 techniques and design have been deployed and evaluated
                 in large scale data centers serving billions of users.
                 We present production results showing that Dynamo has
                 prevented 18 potential power outages in the past 6
                 months due to unexpected power surges; that Dynamo
                 enables optimizations leading to a 13\% performance
                 boost for a production Hadoop cluster and a nearly 40\%
                 performance increase for a search cluster; and that
                 Dynamo has already enabled an 8\% increase in the power
                 capacity utilization of one of our data centers with
                 more aggressive power subscription measures underway.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

@Article{Wong:2016:PEA,
  author =       "Daniel Wong",
  title =        "Peak efficiency aware scheduling for highly energy
                 proportional servers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "481--492",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001188",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Energy proportionality of data center servers have
                 improved drastically over the past decade to the point
                 where near ideal energy proportional servers are now
                 common. These highly energy proportional servers
                 exhibit the unique property where peak efficiency no
                 longer coincides with peak utilization. In this paper,
                 we explore the implications of this property on data
                 center scheduling. We identified that current state of
                 the art data center schedulers does not efficiently
                 leverage these properties, leading to inefficient
                 scheduling decisions. We propose Peak Efficiency Aware
                 Scheduling (PEAS) which can achieve better-than-ideal
                 energy proportionality at the data center level. We
                 demonstrate that PEAS can reduce average power by
                 25.5\% with 3.0\% improvement to TCO compared to
                 state-of-the-art scheduling policies.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

@Article{Li:2016:PAD,
  author =       "Chao Li and Zhenhua Wang and Xiaofeng Hou and Haopeng
                 Chen and Xiaoyao Liang and Minyi Guo",
  title =        "Power attack defense: securing battery-backed data
                 centers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "493--505",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001189",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Battery systems are crucial components for
                 mission-critical data centers. Without secure energy
                 backup, existing under-provisioned data centers are
                 largely unguarded targets for cyber criminals.
                 Particularly for today's scale-out servers, power
                 oversubscription unavoidably taxes a data center's
                 backup energy resources, leaving very little room for
                 dealing with emergency. Besides, the emerging trend
                 towards deploying distributed energy storage
                 architecture causes the associated energy backup of
                 each rack to shrink, making servers vulnerable to power
                 anomalies. As a result, an attacker can generate power
                 peaks to easily crash or disrupt a power-constrained
                 system. This study aims at securing data centers from
                 malicious loads that seek to drain their precious
                 energy storage and overload server racks without prior
                 detection. We term such load as Power Virus (PV) and
                 demonstrate its basic two-phase attacking model and
                 characterize its behaviors on real systems. The PV can
                 learn the victim rack's battery characteristics by
                 disguising as benign loads. Once gaining enough
                 information, the PV can be mutated to generate hidden
                 power spikes that have a high chance to overload the
                 system. To defend against PV, we propose power attack
                 defense (PAD), a novel energy management patch built on
                 lightweight software and hardware mechanisms. PAD not
                 only increases the attacking cost considerably by
                 hiding vulnerable racks from visible spikes, it also
                 strengthens the last line of defense against hidden
                 spikes. Using Google cluster traces we show that PAD
                 can effectively raise the bar of a successful power
                 attack: compared to prior arts, it increases the data
                 center survival time by 1.6$ \sim $11X and provides better
                 performance guarantee. It enables modern data centers
                 to safely exploit the benefits that power
                 oversubscription may provide, with the slightest cost
                 overhead.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

@Article{Gao:2016:DLP,
  author =       "Mingyu Gao and Christina Delimitrou and Dimin Niu and
                 Krishna T. Malladi and Hongzhong Zheng and Bob Brennan
                 and Christos Kozyrakis",
  title =        "{DRAF}: a low-power {DRAM}-based reconfigurable
                 acceleration fabric",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "506--518",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001191",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "FPGAs are a popular target for application-specific
                 accelerators because they lead to a good balance
                 between flexibility and energy efficiency. However,
                 FPGA lookup tables introduce significant area and power
                 overheads, making it difficult to use FPGA devices in
                 environments with tight cost and power constraints.
                 This is the case for datacenter servers, where a
                 modestly-sized FPGA cannot accommodate the large number
                 of diverse accelerators that datacenter applications
                 need. This paper introduces DRAF, an architecture for
                 bit-level reconfigurable logic that uses DRAM subarrays
                 to implement dense lookup tables. DRAF overlaps DRAM
                 operations like bitline precharge and charge
                 restoration with routing within the reconfigurable
                 routing fabric to minimize the impact of DRAM latency.
                 It also supports multiple configuration contexts that
                 can be used to quickly switch between different
                 accelerators with minimal latency. Overall, DRAF trades
                 off some of the performance of FPGAs for significant
                 gains in area and power. DRAF improves area density by
                 10x over FPGAs and power consumption by more than 3x,
                 enabling DRAF to satisfy demanding applications within
                 strict power and cost constraints. While accelerators
                 mapped to DRAF are 2--3x slower than those in FPGAs,
                 they still deliver a 13x speedup and an 11x reduction
                 in power consumption over a Xeon core for a wide range
                 of datacenter tasks, including analytics and
                 interactive services like speech recognition.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

%%% In the abstract below, N is the write-slowdown factor; N and its
%%% exponents are typeset together in TeX math mode.
@Article{Zhang:2016:MWE,
  author =       "Lunkai Zhang and Brian Neely and Diana Franklin and
                 Dmitri Strukov and Yuan Xie and Frederic T. Chong",
  title =        "{Mellow Writes}: extending lifetime in resistive
                 memories through selective slow write backs",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "519--531",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001192",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Emerging resistive memory technologies, such as PCRAM
                 and ReRAM, have been proposed as promising replacements
                 for DRAM-based main memory, due to their better
                 scalability, low standby power, and non-volatility.
                 However, limited write endurance is a major drawback
                 for such resistive memory technologies. Wear leveling
                 (balancing the distribution of writes) and wear
                 limiting (reducing the number of writes) have been
                 proposed to mitigate this disadvantage, but both
                 techniques only manage a fixed budget of writes to a
                 memory system rather than increase the number
                 available. In this paper, we propose a new type of wear
                 limiting technique, Mellow Writes, which reduces the
                 wearout of individual writes rather than reducing the
                 number of writes. Mellow Writes is based on the fact
                 that slow writes performed with lower dissipated power
                 can lead to longer endurance (and therefore longer
                 lifetimes). For non-volatile memories, an $N^1$ to
                 $N^3$ times endurance can be achieved if the write
                 operation is slowed down by $N$ times. We present three
                 microarchitectural mechanisms (Bank-Aware Mellow
                 Writes, Eager Mellow Writes, and Wear Quota) that
                 selectively perform slow writes to increase memory
                 lifetime while minimizing performance impact. Assuming
                 a factor $N^2$ advantage in cell endurance for a factor
                 $N$ slower write, our best Mellow Writes mechanism can
                 achieve 2.58$ \times $ lifetime and 1.06$ \times $
                 performance of the baseline system. In addition, its
                 performance is almost the same as a system aggressively
                 optimized for performance (at the expense of
                 endurance). Finally, Wear Quota guarantees a minimal
                 lifetime (e.g., 8 years) by forcing more slow writes in
                 presence of heavy workloads. We also perform
                 sensitivity analysis on the endurance advantage factor
                 for slow writes, from $N^1$ to $N^3$, and find that our
                 technique is still useful for factors as low as
                 $N^1$.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

@Article{Zhou:2016:MMI,
  author          = {Yanqi Zhou and David Wentzlaff},
  title           = {{MITTS}: memory inter-arrival time traffic shaping},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {44},
  number          = {3},
  pages           = {532--544},
  month           = jun,
  year            = {2016},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/3007787.3001193},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Thu Jan 12 18:43:43 MST 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Memory bandwidth severely limits the scalability and
                     performance of multicore and manycore systems.
                     Application performance can be very sensitive to both
                     the delivered memory bandwidth and latency. In
                     multicore systems, a memory channel is usually shared
                     by multiple cores. Having the ability to precisely
                     provision, schedule, and isolate memory bandwidth and
                     latency on a per-core basis is particularly important
                     when different memory guarantees are needed on a
                     per-customer, per-application, or per-core basis.
                     Infrastructure as a Service (IaaS) Cloud systems, and
                     even general purpose multicores optimized for
                     application throughput or fairness all benefit from the
                     ability to control and schedule memory access on a
                     fine-grain basis. In this paper, we propose MITTS
                     (Memory Inter-arrival Time Traffic Shaping), a simple,
                     distributed hardware mechanism which limits memory
                     traffic at the source (Core or LLC). MITTS shapes
                     memory traffic based on memory request inter-arrival
                     time, enabling fine-grain bandwidth allocation. In an
                     IaaS system, MITTS enables Cloud customers to express
                     their memory distribution needs and pay commensurately.
                     For instance, MITTS enables charging customers that
                     have bursty memory traffic more than customers with
                     uniform memory traffic for the same aggregate
                     bandwidth. Beyond IaaS systems, MITTS can also be used
                     to optimize for throughput or fairness in a general
                     purpose multi-program workload. MITTS uses an online
                     genetic algorithm to configure hardware bins, which can
                     adapt for program phases and variable input sets. We
                     have implemented MITTS in Verilog and have taped-out
                     the design in a 25-core 32nm processor and find that
                     MITTS requires less than 0.9\% of core area. We
                     evaluate across SPECint, PARSEC, Apache, and bhm Mail
                     Server workloads, and find that MITTS achieves an
                     average 1.18$ \times $ performance gain compared to the
                     best static bandwidth allocation, a 2.69$ \times $
                     average performance/cost advantage in an IaaS setting,
                     and up to 1.17$ \times $ better throughput and 1.52$
                     \times $ better fairness when compared to conventional
                     memory bandwidth provisioning techniques.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  remark          = {ISCA '16 conference proceedings.},
}

%%% Both authors have two-word family names (San Miguel; Enright
%%% Jerger).  Each is braced so that BibTeX does not read its first
%%% word as a given name.
@Article{SanMiguel:2016:AA,
  author =       "Joshua {San Miguel} and Natalie {Enright Jerger}",
  title =        "The anytime automaton",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "545--557",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001195",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Approximate computing is an emerging paradigm enabling
                 tradeoffs between accuracy and efficiency. However, a
                 fundamental challenge persists: state-of-the-art
                 techniques lack the ability to enforce runtime
                 guarantees on accuracy. The convention is to (1) employ
                 offline or online accuracy models, or (2) present
                 experimental results that demonstrate empirically low
                 error. Unfortunately, these approaches are still unable
                 to guarantee acceptability of all application outputs
                 at runtime. We offer a solution that revisits concepts
                 from anytime algorithms. Originally explored for
                 real-time decision problems, anytime algorithms have
                 the property of producing results with increasing
                 accuracy over time. We propose the Anytime Automaton, a
                 new computation model that executes applications as a
                 parallel pipeline of anytime approximations. An
                 automaton produces approximate versions of the
                 application output with increasing accuracy,
                 guaranteeing that the final precise version is
                 eventually reached. The automaton can be stopped
                 whenever the output is deemed acceptable; otherwise, it
                 is a simple matter of letting it run longer. We present
                 an in-depth analysis of the model and demonstrate
                 attractive runtime-accuracy profiles on various
                 applications. Our anytime automaton is the first step
                 towards systems where the acceptability of an
                 application's output directly governs the amount of
                 time and energy expended.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

@Article{Wang:2016:AMR,
  author          = {Siyang Wang and Xiangyu Zhang and Yuxuan Li and Ramin
                     Bashizade and Song Yang and Chris Dwyer and Alvin R.
                     Lebeck},
  title           = {Accelerating {Markov} random field inference using
                     molecular optical {Gibbs} sampling units},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {44},
  number          = {3},
  pages           = {558--569},
  month           = jun,
  year            = {2016},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/3007787.3001196},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Thu Jan 12 18:43:43 MST 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {The increasing use of probabilistic algorithms from
                     statistics and machine learning for data analytics
                     presents new challenges and opportunities for the
                     design of computing systems. One important class of
                     probabilistic machine learning algorithms is Markov
                     Chain Monte Carlo (MCMC) sampling, which can be used on
                     a wide variety of applications in Bayesian Inference.
                     However, this probabilistic iterative algorithm can be
                     inefficient in practice on today's processors,
                     especially for problems with high dimensionality and
                     complex structure. The source of inefficiency is
                     generating samples from parameterized probability
                     distributions. This paper seeks to address this
                     sampling inefficiency and presents a new approach to
                     support probabilistic computing that leverages the
                     native randomness of Resonance Energy Transfer (RET)
                     networks to construct RET-based sampling units (RSU).
                     Although RSUs can be designed for a variety of
                     applications, we focus on the specific class of
                     probabilistic problems described as Markov Random Field
                     Inference. Our proposed RSU uses a RET network to
                     implement a molecular-scale optical Gibbs sampling unit
                     (RSU-G) that can be integrated into a processor / GPU
                     as specialized functional units or organized as a
                     discrete accelerator. We experimentally demonstrate the
                     fundamental operation of an RSU using a macro-scale
                     hardware prototype. Emulation-based evaluation of two
                     computer vision applications for HD images reveal that
                     an RSU augmented GPU provides speedups over a GPU of 3
                     and 16. Analytic evaluation shows a discrete
                     accelerator that is limited by 336 GB/s DRAM produces
                     speedups of 21 and 54 versus the GPU implementations.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  remark          = {ISCA '16 conference proceedings.},
}

@Article{Huang:2016:EAA,
  author          = {Yipeng Huang and Ning Guo and Mingoo Seok and Yannis
                     Tsividis and Simha Sethumadhavan},
  title           = {Evaluation of an analog accelerator for linear
                     algebra},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {44},
  number          = {3},
  pages           = {570--582},
  month           = jun,
  year            = {2016},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/3007787.3001197},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Thu Jan 12 18:43:43 MST 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Due to the end of supply voltage scaling and the
                     increasing percentage of dark silicon in modern
                     integrated circuits, researchers are looking for new
                     scalable ways to get useful computation from existing
                     silicon technology. In this paper we present a
                     reconfigurable analog accelerator for solving systems
                     of linear equations. Commonly perceived downsides of
                     analog computing, such as low precision and accuracy,
                     limited problem sizes, and difficulty in programming
                     are all compensated for using methods we discuss. Based
                     on a prototyped analog accelerator chip we compare the
                     performance and energy consumption of the analog solver
                     against an efficient digital algorithm running on a
                     CPU, and find that the analog accelerator approach may
                     be an order of magnitude faster and provide one third
                     energy savings, depending on the accelerator design.
                     Due to the speed and efficiency of linear algebra
                     algorithms running on digital computers, an analog
                     accelerator that matches digital performance needs a
                     large silicon footprint. Finally, we conclude that
                     problem classes outside of systems of linear equations
                     may hold more promise for analog acceleration.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  remark          = {ISCA '16 conference proceedings.},
}

@Article{Wang:2016:LLA,
  author          = {Jin Wang and Norm Rubin and Albert Sidelnik and
                     Sudhakar Yalamanchili},
  title           = {{LaPerm}: locality aware scheduler for dynamic
                     parallelism on {GPUs}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {44},
  number          = {3},
  pages           = {583--595},
  month           = jun,
  year            = {2016},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/3007787.3001199},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Thu Jan 12 18:43:43 MST 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Recent developments in GPU execution models and
                     architectures have introduced dynamic parallelism to
                     facilitate the execution of irregular applications
                     where control flow and memory behavior can be
                     unstructured, time-varying, and hierarchical. The
                     changes brought about by this extension to the
                     traditional bulk synchronous parallel (BSP) model also
                     creates new challenges in exploiting the current GPU
                     memory hierarchy. One of the major challenges is that
                     the reference locality that exists between the parent
                     and child thread blocks (TBs) created during dynamic
                     nested kernel and thread block launches cannot be fully
                     leveraged using the current TB scheduling strategies.
                     These strategies were designed for the current
                     implementations of the BSP model but fall short when
                     dynamic parallelism is introduced since they are
                     oblivious to the hierarchical reference locality. We
                     propose LaPerm, a new locality-aware TB scheduler that
                     exploits such parent-child locality, both spatial and
                     temporal. LaPerm adopts three different scheduling
                     decisions to (i) prioritize the execution of the child
                     TBs, (ii) bind them to the stream multiprocessors
                     (SMXs) occupied by their parents TBs, and (iii)
                     maintain workload balance across compute units.
                     Experiments with a set of irregular CUDA applications
                     executed on a cycle-level simulator employing dynamic
                     parallelism demonstrate that LaPerm is able to achieve
                     an average of 27\% performance improvement over the
                     baseline round-robin TB scheduler commonly used in
                     modern GPUs.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  remark          = {ISCA '16 conference proceedings.},
}

@Article{Shahar:2016:ACS,
  author          = {Sagi Shahar and Shai Bergman and Mark Silberstein},
  title           = {{ActivePointers}: a case for software address
                     translation on {GPUs}},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {44},
  number          = {3},
  pages           = {596--608},
  month           = jun,
  year            = {2016},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/3007787.3001200},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Thu Jan 12 18:43:43 MST 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Modern discrete GPUs have been the processors of
                     choice for accelerating compute-intensive applications,
                     but using them in large-scale data processing is
                     extremely challenging. Unfortunately, they do not
                     provide important I/O abstractions long established in
                     the CPU context, such as memory mapped files, which
                     shield programmers from the complexity of buffer and
                     I/O device management. However, implementing these
                     abstractions on GPUs poses a problem: the limited GPU
                     virtual memory system provides no address space
                     management and page fault handling mechanisms to GPU
                     developers, and does not allow modifications to memory
                     mappings for running GPU programs. We implement
                     ActivePointers, a software address translation layer
                     and paging system that introduces native support for
                     page faults and virtual address space management to GPU
                     programs, and enables the implementation of fully
                     functional memory mapped files on commodity GPUs. Files
                     mapped into GPU memory are accessed using active
                     pointers, which behave like regular pointers but access
                     the GPU page cache under the hood, and trigger page
                     faults which are handled on the GPU. We design and
                     evaluate a number of novel mechanisms, including a
                     translation cache in hardware registers and translation
                     aggregation for deadlock-free page fault handling of
                     threads in a single warp. We extensively evaluate
                     ActivePointers on commodity NVIDIA GPUs using
                     microbenchmarks, and also implement a complex image
                     processing application that constructs a photo collage
                     from a subset of 10 million images stored in a 40GB
                     file. The GPU implementation maps the entire file into
                     GPU memory and accesses it via active pointers. The use
                     of active pointers adds only up to 1\% to the
                     application's runtime, while enabling speedups of up to
                     3.9$ \times $ over a combined CPU+GPU implementation
                     and 2.6$ \times $ over a 12-core CPU-only
                     implementation which uses AVX vector instructions.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  remark          = {ISCA '16 conference proceedings.},
}

@Article{Yoon:2016:VTM,
  author          = {Myung Kuk Yoon and Keunsoo Kim and Sangpil Lee and Won
                     Woo Ro and Murali Annavaram},
  title           = {Virtual thread: maximizing thread-level parallelism
                     beyond {GPU} scheduling limit},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {44},
  number          = {3},
  pages           = {609--621},
  month           = jun,
  year            = {2016},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/3007787.3001201},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Thu Jan 12 18:43:43 MST 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Modern GPUs require tens of thousands of concurrent
                     threads to fully utilize the massive amount of
                     processing resources. However, thread concurrency in
                     GPUs can be diminished either due to shortage of thread
                     scheduling structures (scheduling limit), such as
                     available program counters and single instruction
                     multiple thread stacks, or due to shortage of on-chip
                     memory (capacity limit), such as register file and
                     shared memory. Our evaluations show that in practice
                     concurrency in many general purpose applications
                     running on GPUs is curtailed by the scheduling limit
                     rather than the capacity limit. Maximizing the
                     utilization of on-chip memory resources without unduly
                     increasing the scheduling complexity is a key goal of
                     this paper. This paper proposes a Virtual Thread (VT)
                     architecture which assigns Cooperative Thread Arrays
                     (CTAs) up to the capacity limit, while ignoring the
                     scheduling limit. However, to reduce the logic
                     complexity of managing more threads concurrently, we
                     propose to place CTAs into active and inactive states,
                     such that the number of active CTAs still respects the
                     scheduling limit. When all the warps in an active CTA
                     hit a long latency stall, the active CTA is context
                     switched out and the next ready CTA takes its place. We
                     exploit the fact that both active and inactive CTAs
                     still fit within the capacity limit which obviates the
                     need to save and restore large amounts of CTA state.
                     Thus VT significantly reduces performance penalties of
                     CTA swapping. By swapping between active and inactive
                     states, VT can exploit higher degree of thread level
                     parallelism without increasing logic complexity. Our
                     simulation results show that VT improves performance by
                     23.9\% on average.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  remark          = {ISCA '16 conference proceedings.},
}

@Article{Kim:2016:AIE,
  author          = {Jungrae Kim and Michael Sullivan and Sangkug Lym and
                     Mattan Erez},
  title           = {All-inclusive {ECC}: thorough end-to-end protection
                     for reliable computer memory},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {44},
  number          = {3},
  pages           = {622--633},
  month           = jun,
  year            = {2016},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/3007787.3001203},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Thu Jan 12 18:43:43 MST 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Increasing transfer rates and decreasing I/O voltage
                     levels make signals more vulnerable to transmission
                     errors. While the data in computer memory are
                     well-protected by modern error checking and correcting
                     (ECC) codes, the clock, control, command, and address
                     (CCCA) signals are weakly protected or even unprotected
                     such that transmission errors leave serious gaps in
                     data-only protection. This paper presents All-Inclusive
                     ECC (AIECC), a memory protection scheme that leverages
                     and augments data ECC to also thoroughly protect CCCA
                     signals. AIECC provides strong end-to-end protection of
                     memory, detecting nearly 100\% of CCCA errors and also
                     preventing transmission errors from causing latent
                     memory data corruption. AIECC provides these
                     system-level benefits without requiring extra storage
                     and transfer overheads and without degrading the
                     effective level of data protection.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  remark          = {ISCA '16 conference proceedings.},
}

@Article{Duwe:2016:RUF,
  author          = {Henry Duwe and Xun Jian and Daniel Petrisko and Rakesh
                     Kumar},
  title           = {Rescuing uncorrectable fault patterns in on-chip
                     memories through error pattern transformation},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {44},
  number          = {3},
  pages           = {634--644},
  month           = jun,
  year            = {2016},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/3007787.3001204},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Thu Jan 12 18:43:43 MST 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Voltage scaling can effectively reduce processor
                     power, but also reduces the reliability of the SRAM
                     cells in on-chip memories. Therefore, it is often
                     accompanied by the use of an error correcting code
                     (ECC). To enable reliable and efficient memory
                     operation at low voltages, ECCs for on-chip memories
                     must provide both high error coverage and low
                     correction latency. In this paper, we propose error
                     pattern transformation, a novel low-latency error
                     correction technique that allows on-chip memories to be
                     scaled to voltages lower than what has been previously
                     possible. Our technique relies on the observation that
                     the number of on-chip memory errors that many ECCs can
                     correct differs widely depending on the error patterns
                     in the logical words they protect. We propose
                     adaptively rearranging the logical bit to physical bit
                     mapping per word according to the BIST-detectable fault
                     pattern in the physical word. The adaptive logical bit
                     to physical bit mapping transforms many uncorrectable
                     error patterns in the logical words into correctable
                     error patterns and, therefore, improving on-chip memory
                     reliability. This reduces the minimum voltage at which
                     on-chip memory can run by 70mV over the best
                     low-latency ECC baseline, leading to a 25.7\% core-wide
                     power reduction for an ARM Cortex-A7-like core. Energy
                     per instruction is reduced by 15.7\% compared to the
                     best baseline.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  remark          = {ISCA '16 conference proceedings.},
}

@Article{Kim:2016:RMR,
  author          = {Dong Wan Kim and Mattan Erez},
  title           = {{RelaxFault} memory repair},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {44},
  number          = {3},
  pages           = {645--657},
  month           = jun,
  year            = {2016},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/3007787.3001205},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Thu Jan 12 18:43:43 MST 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Memory system reliability is a serious concern in many
                     systems today, and is becoming more worrisome as
                     technology scales and system size grows. Stronger fault
                     tolerance capability is therefore desirable, but often
                     comes at high cost. In this paper, we propose a
                     low-cost, fault-aware, hardware-only resilience
                     mechanism, RelaxFault, that repairs the vast majority
                     of memory faults using a small amount of the LLC to
                     remap faulty memory locations. RelaxFault requires less
                     than 100KiB of LLC capacity, has near-zero impact on
                     performance and power. By repairing faults, RelaxFault
                     relaxes the requirement for high fault tolerance of
                     other mechanisms, such as ECC. A better tradeoff
                     between resilience and overhead is made by exploiting
                     an understanding of memory system architecture and
                     fault characteristics. We show that RelaxFault provides
                     better repair capability than prior work of similar
                     cost, improves memory reliability to a greater extent,
                     and significantly reduces the number of maintenance
                     events and memory module replacements. We also propose
                     a more refined memory fault model than prior work and
                     demonstrate its importance.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  remark          = {ISCA '16 conference proceedings.},
}

%%% ISCA '16: MIMO formal control for tuning architectural parameters.
@article{Pothukuchi:2016:UMI,
  author          = {Raghavendra Pradyumna Pothukuchi and Amin Ansari and
                     Petros Voulgaris and Josep Torrellas},
  title           = {Using multiple input, multiple output formal control
                     to maximize resource efficiency in architectures},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {44},
  number          = {3},
  pages           = {658--670},
  month           = jun,
  year            = {2016},
  coden           = {CANED2},
  doi             = {https://doi.org/10.1145/3007787.3001207},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  bibdate         = {Thu Jan 12 18:43:43 MST 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {As processors seek more resource efficiency, they
                     increasingly need to target multiple goals at the same
                     time, such as a level of performance, power
                     consumption, and average utilization. Robust control
                     solutions cannot come from heuristic-based controllers
                     or even from formal approaches that combine multiple
                     single-parameter controllers. Such controllers may
                     end-up working against each other. What is needed is
                     control-theoretical MIMO (multiple input, multiple
                     output) controllers, which actuate on multiple inputs
                     and control multiple outputs in a coordinated manner.
                     In this paper, we use MIMO control-theory techniques to
                     develop controllers to dynamically tune architectural
                     parameters in processors. To our knowledge, this is the
                     first work in this area. We discuss three ways in which
                     a MIMO controller can be used. We develop an example of
                     MIMO controller and show that it is substantially more
                     effective than controllers based on heuristics or built
                     by combining single-parameter formal controllers. The
                     general approach discussed here is likely to be
                     increasingly relevant as future processors become more
                     resource-constrained and adaptive.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  remark          = {ISCA '16 conference proceedings.},
}

%%% ISCA '16: exploiting dynamic timing slack in ultra-low-power processors.
@article{Cherupalli:2016:EDT,
  author          = {Hari Cherupalli and Rakesh Kumar and John Sartori},
  title           = {Exploiting dynamic timing slack for energy efficiency
                     in ultra-low-power embedded systems},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {44},
  number          = {3},
  pages           = {671--681},
  month           = jun,
  year            = {2016},
  coden           = {CANED2},
  doi             = {https://doi.org/10.1145/3007787.3001208},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  bibdate         = {Thu Jan 12 18:43:43 MST 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Many emerging applications such as the internet of
                     things, wearables, and sensor networks have
                     ultra-low-power requirements. At the same time, cost
                     and programmability considerations dictate that many of
                     these applications will be powered by general purpose
                     embedded microprocessors and microcontrollers, not
                     ASICs. In this paper, we exploit a new opportunity for
                     improving energy efficiency in ultra-low-power
                     processors expected to drive these applications ---
                     dynamic timing slack. Dynamic timing slack exists when
                     an embedded software application executed on a
                     processor does not exercise the processor's static
                     critical paths. In such scenarios, the longest path
                     exercised by the application has additional timing
                     slack which can be exploited for power savings at no
                     performance cost by scaling down the processor's
                     voltage at the same frequency until the longest
                     exercised paths just meet timing constraints. Paths
                     that cannot be exercised by an application can safely
                     be allowed to violate timing constraints. We show that
                     dynamic timing slack exists for many ultra-low-power
                     applications and that exploiting dynamic timing slack
                     can result in significant power savings for many
                     ultra-low-power processors. We also present an
                     automated methodology for identifying dynamic timing
                     slack and selecting a safe operating point for a
                     processor and a particular embedded software. Our
                     approach for identifying and exploiting dynamic timing
                     slack is non-speculative, requires no programmer
                     intervention and little or no hardware support, and
                     demonstrates potential power savings of up to 32\%,
                     25\% on average, over a range of embedded applications
                     running on a common ultra-low-power processor, at no
                     performance cost.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  remark          = {ISCA '16 conference proceedings.},
}

%%% ISCA '16: CASH, a sub-core configurable architecture for IaaS customers.
@article{Zhou:2016:CSI,
  author          = {Yanqi Zhou and Henry Hoffmann and David Wentzlaff},
  title           = {{CASH}: supporting {IaaS} customers with a sub-core
                     configurable architecture},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {44},
  number          = {3},
  pages           = {682--694},
  month           = jun,
  year            = {2016},
  coden           = {CANED2},
  doi             = {https://doi.org/10.1145/3007787.3001209},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  bibdate         = {Thu Jan 12 18:43:43 MST 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Infrastructure as a Service (IaaS) Clouds have grown
                     increasingly important. Recent architecture designs
                     support IaaS providers through fine-grain
                     configurability, allowing providers to orchestrate
                     low-level resource usage. Little work, however, has
                     been devoted to supporting IaaS customers who must
                     determine how to use such fine-grain configurable
                     resources to meet quality-of-service (QoS) requirements
                     while minimizing cost. This is a difficult problem
                     because the multiplicity of configurations creates a
                     non-convex optimization space. In addition, this
                     optimization space may change as customer applications
                     enter and exit distinct processing phases. In this
                     paper, we overcome these issues by proposing CASH: a
                     fine-grain configurable architecture co-designed with a
                     cost-optimizing runtime system. The hardware
                     architecture enables configurability at the granularity
                     of individual ALUs and L2 cache banks and provides
                     unique interfaces to support low-overhead, dynamic
                     configuration and monitoring. The runtime uses a
                     combination of control theory and machine learning to
                     configure the architecture such that QoS requirements
                     are met and cost is minimized. Our results demonstrate
                     that the combination of fine-grain configurability and
                     non-convex optimization provides tremendous cost
                     savings (70\% savings) compared to coarse-grain
                     heterogeneity and heuristic optimization. In addition,
                     the system is able to customize configurations to
                     particular applications, respond to application phases,
                     and provide near optimal cost for QoS targets.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  remark          = {ISCA '16 conference proceedings.},
}

%%% ISCA '16: PCMap, overlapping reads and writes with an ongoing PCM write.
@article{Arjomand:2016:BAP,
  author          = {Mohammad Arjomand and Mahmut T. Kandemir and Anand
                     Sivasubramaniam and Chita R. Das},
  title           = {Boosting access parallelism to {PCM}-based main
                     memory},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {44},
  number          = {3},
  pages           = {695--706},
  month           = jun,
  year            = {2016},
  coden           = {CANED2},
  doi             = {https://doi.org/10.1145/3007787.3001211},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  bibdate         = {Thu Jan 12 18:43:43 MST 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                     https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Despite its promise as a DRAM main memory replacement,
                     Phase Change Memory (PCM) has high write latencies
                     which can be a serious detriment to its widespread
                     adoption. Apart from slowing down a write request, the
                     consequent high latency can also keep other chips of
                     the same rank, that are not involved in this write,
                     idle for long times. There are several practical
                     considerations that make it difficult to allow
                     subsequent reads and/or writes to be served
                     concurrently from the same chips during the long
                     latency write. This paper proposes and evaluates
                     several novel mechanisms --- re-constructing data from
                     error correction bits instead of waiting for chips
                     currently busy to serve a read, rotating word mappings
                     across chips of a PCM rank, and rotating the mapping of
                     error detection/correction bits across these chips ---
                     to overlap several reads with an ongoing write (RoW)
                     and even a write with an ongoing write (WoW). The paper
                     also presents the necessary micro-architectural
                     enhancements needed to implement these mechanisms,
                     without significantly changing the current interfaces.
                     The resulting PCM access parallelism (PCMap) system
                     incorporating these enhancements, boosts the
                     intra-rank-level parallelism during such writes from a
                     very low baseline value of 2.4 to an average and
                     maximum values of 4.5 and 7.4, respectively (out of a
                     maximum of 8.0), across a wide spectrum of both
                     multiprogrammed and multithreaded workloads. This boost
                     in parallelism results in an average IPC improvement of
                     15.6\% and 16.7\% for the multiprogrammed and
                     multithreaded workloads, respectively.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  remark          = {ISCA '16 conference proceedings.},
}

%%% ISCA '16: agile paging, combining nested and shadow paging.
@article{Gandhi:2016:APE,
  author          = {Jayneel Gandhi and Mark D. Hill and Michael M. Swift},
  title           = {Agile paging: exceeding the best of nested and shadow
                     paging},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {44},
  number          = {3},
  pages           = {707--718},
  month           = jun,
  year            = {2016},
  coden           = {CANED2},
  doi             = {https://doi.org/10.1145/3007787.3001212},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  bibdate         = {Thu Jan 12 18:43:43 MST 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Virtualization provides benefits for many workloads,
                     but the overheads of virtualizing memory are not
                     universally low. The cost comes from managing two
                     levels of address translation---one in the guest
                     virtual machine (VM) and the other in the host virtual
                     machine monitor (VMM)---with either nested or shadow
                     paging. Nested paging directly performs a two-level
                     page walk that makes TLB misses slower than
                     unvirtualized native, but enables fast page tables
                     changes. Alternatively, shadow paging restores native
                     TLB miss speeds, but requires costly VMM intervention
                     on page table updates. This paper proposes agile paging
                     that combines both techniques and exceeds the best of
                     both. A virtualized page walk starts with shadow paging
                     and optionally switches in the same page walk to nested
                     paging where frequent page table updates would cause
                     costly VMM interventions. Agile paging enables most TLB
                     misses to be handled as fast as native while most page
                     table changes avoid VMM intervention. It requires
                     modest changes to hardware (e.g., demark when to
                     switch) and VMM policies (e.g., predict good switching
                     opportunities). We emulate the proposed hardware and
                     prototype the software in Linux with KVM on x86-64.
                     Agile paging performs more than 12\% better than the
                     best of the two techniques and comes within 4\% of
                     native execution for all workloads.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  remark          = {ISCA '16 conference proceedings.},
}

%%% ISCA '16: Bitwise Difference Encoding (BD-Encoding) for DRAM data buses.
%%% The eponym ``Hamming'' (weight, distance) is capitalized in the abstract.
@Article{Seol:2016:EED,
  author =       "Hoseok Seol and Wongyu Shin and Jaemin Jang and
                 Jungwhan Choi and Jinwoong Suh and Lee-Sup Kim",
  title =        "Energy efficient data encoding in {DRAM} channels
                 exploiting data value similarity",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "719--730",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001213",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "As DRAM data bandwidth increases, tremendous energy is
                 dissipated in the DRAM data bus. To reduce the energy
                 consumed in the data bus, DRAM interfaces with
                 asymmetric termination, such as Pseudo Open Drain (POD)
                 and Low Voltage Swing Terminated Logic (LVSTL), have
                 been adopted in modern DRAMs. In interfaces using
                 asymmetric termination, the amount of termination
                 energy is proportional to the Hamming weight of the
                 data words. In this work, we propose Bitwise Difference
                 Encoding (BD-Encoding), which decreases the Hamming
                 weight of data words, leading to a reduction in energy
                 consumption in the modern DRAM data bus. Since smaller
                 Hamming weight of the data words also reduces switching
                 activity, switching energy and power noise are also
                 both reduced. BD-Encoding exploits the similarity in
                 data words in the DRAM data bus. We observed that
                 similar data words (i.e. data words whose Hamming
                 distance is small) are highly likely to be sent over at
                 similar times. Based on this observation, BD-coder
                 stores the data recently sent over in both the memory
                 controller and DRAMs. Then, BD-coder transfers the
                 bitwise difference between the current data and the
                 most similar data. In an evaluation using SPEC 2006,
                 BD-Encoding using 64 recent data reduced termination
                 energy by 58.3\% and switching energy by 45.3\%. In
                 addition, 55\% of the LdI/dt noise was decreased with
                 BD-Encoding.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ISCA '16 conference proceedings.",
}

%%% HEART '16: statically scheduled collective communication on FPGA clusters.
@article{Sheng:2016:CCF,
  author          = {Jiayi Sheng and Qingqing Xiong and Chen Yang and
                     Martin C. Herbordt},
  title           = {Collective Communication on {FPGA} Clusters with
                     Static Scheduling},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {44},
  number          = {4},
  pages           = {2--7},
  month           = sep,
  year            = {2016},
  coden           = {CANED2},
  doi             = {https://doi.org/10.1145/3039902.3039904},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  bibdate         = {Mon Jun 5 18:01:57 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {FPGA-centric clouds and clusters provide direct and
                     programmable interconnects with obvious benefits for
                     communication latency and bandwidth. One rarely studied
                     aspect of DPI is that they facilitate application-aware
                     routing: if communication patterns are static and known
                     a priori, as is usually the case, then judicious
                     routing can reduce congestion, latency, and the
                     hardware required. In this study we explore applying
                     the method of offline/static routing to collective
                     operations, in particular, multicast and reduction. An
                     entirely new communication infrastructure is proposed
                     and implemented, including switch design and routing
                     algorithm. A substantial improvement in performance is
                     obtained, especially for multicast. We believe that
                     this is one of the few general offline/static routing
                     solutions for real HPC clusters, and FPGA-centric
                     clusters in particular.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  remark          = {HEART '16 conference proceedings.},
}

%%% HEART '16: high-throughput merge network for an FPGA sorting accelerator.
@article{Mashimo:2016:CEH,
  author          = {Susumu Mashimo and Thiem Van Chu and Kenji Kise},
  title           = {Cost-Effective and High-Throughput Merge Network:
                     Architecture for the Fastest {FPGA} Sorting
                     Accelerator},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {44},
  number          = {4},
  pages           = {8--13},
  month           = sep,
  year            = {2016},
  coden           = {CANED2},
  doi             = {https://doi.org/10.1145/3039902.3039905},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  bibdate         = {Mon Jun 5 18:01:57 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {High-performance sorting is used in various areas such
                     as database transactions and genomic feature
                     operations. To improve sorting performance, in addition
                     to the conventional approach of using general purpose
                     processors or GPUs, the approach of using FPGAs is
                     becoming a promising solution. As an FPGA sorting
                     accelerator, Casper and Olukotun have recently proposed
                     the fastest one known so far. In their study, they
                     proposed a merge network which can merge two sorted
                     data series at a throughput of 6 data elements per
                     200MHz clock cycle. If an FPGA sorting accelerator is
                     constructed using merge networks, the overall
                     throughput will be mainly determined by the throughputs
                     of the merge networks. This motivates us to design a
                     merge network which outputs more than 6 data elements
                     per 200MHz clock cycle. In this paper, we propose a
                     cost-effective and high-throughput merge network for
                     the fastest FPGA sorting accelerator. The evaluation
                     shows that our proposal achieves a throughput of 8 data
                     elements per 200MHz clock cycle.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  remark          = {HEART '16 conference proceedings.},
}

%%% HEART '16: FPGA multicore architecture integrating DDoS defenses.
%%% The abstract's misspelling ``Engress'' is corrected to ``Egress''.
@Article{Pham-Quoc:2016:FBM,
  author =       "Cuong Pham-Quoc and Biet Nguyen and Tran Ngoc Thinh",
  title =        "{FPGA}-based Multicore Architecture for Integrating
                 Multiple {DDoS} Defense Mechanisms",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "4",
  pages =        "14--19",
  month =        sep,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3039902.3039906",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:57 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper proposes an FPGA-based multicore
                 architecture to integrate multiple DDoS defense
                 mechanisms for DDoS protection. The architecture allows
                 multiple cooperating DDoS mitigation techniques to
                 classify incoming network packets. The proposed
                 architecture consists of two separate partitions static
                 and dynamic. The static partition includes packet
                 pre-processing and post-processing modules while the
                 DDoS filtering techniques are implemented within the
                 dynamic partition. These filtering techniques can be
                 implemented by either hardware custom computing cores
                 or general purpose soft processors or both. In all
                 cases, these DDoS filtering computing cores can be
                 updated or changed at runtime or design time. We
                 implement our first prototype system with the Hop-count
                 filtering and Ingress/Egress filtering techniques
                 using the Xilinx Virtex 5 xc5vtx240t FPGA device. The
                 synthesis results show that the system can work at up
                 to 116.782MHz while utilizing about 41\% LUTs, 47\%
                 Registers, and 53\% Block Memory of the available
                 hardware resources. Experimental results show that our
                 system achieves a 100\% detection rate (true positive)
                 with a 0\% false negative rate and the maximum 0.74\%
                 false positive rate. Moreover, the prototype system
                 obtains packet processing throughput by up to 9.869
                 Gbps in half-duplex mode and 19.738 Gbps in full-duplex
                 mode.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '16 conference proceedings.",
}

%%% HEART '16: overlay fabric and TriggerPlus mapping for FPGA debug triggers.
@article{Eslami:2016:IOM,
  author          = {Fatemeh Eslami and Steven J. E. Wilton},
  title           = {An Improved Overlay and Mapping Algorithm Supporting
                     Rapid Triggering for {FPGA} Debug},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {44},
  number          = {4},
  pages           = {20--25},
  month           = sep,
  year            = {2016},
  coden           = {CANED2},
  doi             = {https://doi.org/10.1145/3039902.3039907},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  bibdate         = {Mon Jun 5 18:01:57 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Embedded system designers can benefit from FPGA
                     accelerators to achieve higher performance and
                     efficiency. However, there are challenges that do not
                     exist in software development; using software
                     simulators to validate large and complex hardware
                     designs can be extremely slow and impractical.
                     Debugging designs implemented on an FPGA enables
                     running the design at speed for long runs and more
                     exhaustive test cases. However, limited observability
                     is the primary challenge in hardware debug. To enhance
                     hardware observability, trace-buffers and a trigger
                     circuitry are inserted into the design. During the
                     device operation, a history of signals of interest is
                     recorded into the trace-buffers for off-line debug and
                     validation. Recompiling the design every time the
                     designer wishes to modify the trigger condition results
                     in long debug turn-around times and reduced
                     productivity. In this work, we present a
                     pre-synthesized overlay fabric and algorithm to enable
                     rapid triggering; during debug turn-around,
                     TriggerPlus, a greedy algorithm, is used to implement a
                     trigger circuit on the overlay. TriggerPlus is fast and
                     simple, yet still capable of mapping the trigger
                     circuit to the overlay fabric. We evaluate our
                     techniques using VPR, showing that using our overlay
                     and mapping algorithm together is at least an order of
                     magnitude faster than the previous work resulting in a
                     significant reduction in debug turn-around times.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  remark          = {HEART '16 conference proceedings.},
}

%%% HEART '16: Verilog HDL simulation via Pyverilog translation to ArchHDL.
@article{Kobayashi:2016:HSV,
  author          = {Ryohei Kobayashi and Tomohiro Misono and Kenji Kise},
  title           = {A High-speed {Verilog} {HDL} Simulation Method using a
                     Lightweight Translator},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {44},
  number          = {4},
  pages           = {26--31},
  month           = sep,
  year            = {2016},
  coden           = {CANED2},
  doi             = {https://doi.org/10.1145/3039902.3039908},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  bibdate         = {Mon Jun 5 18:01:57 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Designing with Hardware Description Languages (HDLs)
                     is still the de facto standard way to develop
                     FPGA-based custom computing systems, and RTL simulation
                     is an important step in ensuring that the designed
                     hardware behavior meets the design specification. In
                     this paper, we propose a new high-speed Verilog HDL
                     simulation method. It is based on two previously
                     proposed techniques: ArchHDL and Pyverilog. ArchHDL is
                     used as a simulation engine in the method because the
                     RTL simulation provided by ArchHDL can be parallelized
                     with OpenMP. We use Pyverilog to develop a code
                     translator to convert Verilog HDL source code into
                     ArchHDL code, and due to this, the translator can be
                     realized and its implementation is lightweight. We
                     compare the proposed method with Synopsys VCS, and the
                     experimental results show that the RTL simulation
                     behavior and speed are same as that of Synopsys VCS and
                     up to 5.8x better respectively.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  remark          = {HEART '16 conference proceedings.},
}

%%% HEART '16: FPGA solver for partial MaxSAT based on the Dist algorithm.
@article{Sassa:2016:FSP,
  author          = {Shohei Sassa and Kenji Kanazawa and Shaowei Cai and
                     Moritoshi Yasunaga},
  title           = {An {FPGA} Solver for Partial {MaxSAT} Problems Based
                     on Stochastic Local Search},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {44},
  number          = {4},
  pages           = {32--37},
  month           = sep,
  year            = {2016},
  coden           = {CANED2},
  doi             = {https://doi.org/10.1145/3039902.3039909},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  bibdate         = {Mon Jun 5 18:01:57 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {In this paper, we propose an FPGA solver for partial
                     maximum satisfiability (PMS) problems based on the Dist
                     algorithm, which is one of the best performing
                     stochastic local search algorithms for PMS problems.
                     The Dist algorithm searches for a truth assignment for
                     the variables that satisfies all of the hard clauses
                     and as many soft clauses as possible by iteratively
                     selecting a variable using a heuristic and flipping its
                     truth value. During each iteration, new candidate
                     variables for flipping are generated and existing ones
                     may disappear. In our solver, the variables that may
                     become new candidates for flipping are evaluated by
                     parallel and pipeline processing, and then only the
                     variables that actually become the candidates for
                     flipping are extracted and gathered up in concurrent
                     with the pipeline processing. The extraction process is
                     not influenced by the number of the new candidates or
                     their random generation, which minimizes the
                     disturbance of the parallel and pipeline processing.
                     Our FPGA solver can solve large PMS problems up to 7.74
                     times faster than running Dist on CPU.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  remark          = {HEART '16 conference proceedings.},
}

@Article{Houtgast:2016:EGI,
  author =       "Ernst Joachim Houtgast and Vlad-Mihai Sima and Koen
                 Bertels and Zaid Al-Ars",
  title =        "An Efficient {GPU}-Accelerated Implementation of
                 Genomic Short Read Mapping with {BWA-MEM}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "4",
  pages =        "38--43",
  month =        sep,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3039902.3039910",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:57 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Next Generation Sequencing techniques have resulted in
                 an exponential growth in the generation of genetics
                 data, the amount of which will soon rival, if not
                 overtake, other Big Data fields, such as astronomy and
                 streaming video services. To become useful, this data
                 requires processing by a complex pipeline of
                 algorithms, taking multiple days even on large
                 clusters. The mapping stage of such genomics pipelines,
                 which maps the short reads onto a reference genome,
                 takes up a significant portion of execution time.
                 BWA-MEM is the de-facto industry-standard for the
                 mapping stage. Here, a GPU-accelerated implementation
                 of BWA-MEM is proposed. The Seed Extension phase, one
                 of the three main BWA-MEM algorithm phases that
                 requires between 30\%--50\% of overall processing time,
                 is offloaded onto the GPU. A thorough design space
                 analysis is presented for an optimized mapping of this
                 phase onto the GPU. The resulting systolic-array based
                 implementation obtains a two-fold overall
                 application-level speedup, which is the maximum
                 theoretically achievable speedup. Moreover, this
                 speedup is sustained for systems with up to twenty-two
                 logical cores. Based on the findings, a number of
                 suggestions are made to improve GPU architecture,
                 resulting in potentially greatly increased performance
                 for bioinformatics-class algorithms.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '16 conference proceedings.",
}

@Article{Nakahara:2016:FCS,
  author =       "Hiroki Nakahara and Hiroyuki Nakanishi and Kazumasa
                 Iwai and Tsutomu Sasao",
  title =        "An {FFT} Circuit for a Spectrometer of a Radio
                 Telescope using the Nested {RNS} including the Constant
                 Division",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "4",
  pages =        "44--49",
  month =        sep,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3039902.3039911",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:57 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "A radio telescope analyzes radio frequency (RF)
                 received from celestial objects. It consists of an
                 antenna, a receiver, and a spectrometer. The
                 spectrometer converts the time domain into the
                 frequency domain by an FFT operation. This paper
                 applies an FFT circuit based on nested residue number
                 system (NRNS), which recursively decompose the RNS. It
                 can decompose the MAC unit into circuits with small
                 sizes. In the FFT using the NRNS, a MAC unit is
                 decomposed into 4-bit ones realized by look-up tables
                 of the FPGA. Also, to realize the scaling (truncation)
                 circuit, we propose a constant division algorithm on
                 the FPGA. The truncation is realized by the division of
                 a dynamic range for a subset of moduli. We implemented
                 the proposed NRNS FFT on the Xilinx Inc. Virtex 6 FPGA.
                 Compared with a Xilinx Inc. binary FFT library,
                 although the number of block RAMs (BRAMs) was increased
                 by 38\%, in the RNS FFT, the number of LUTs was
                 decreased by 42--45\% and the maximum clock frequency
                 was increased by 38--74\%. With this technique, we
                 successfully implemented an FFT that satisfied the
                 required size and speed specifications on an available
                 FPGA, since the excessive number of LUTs was the
                 bottleneck of the binary FFT.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '16 conference proceedings.",
}

@Article{Pangracious:2016:NTD,
  author =       "Vinod Pangracious and Mulhim Al-Doori",
  title =        "Novel Three-Dimensional Embedded {FPGA} Technology and
                 Architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "4",
  pages =        "50--55",
  month =        sep,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3039902.3039912",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:57 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In this paper we present a high density
                 three-dimensional (3D) interconnect network
                 implementation based on a modified Mesh-of-Trees (MoT)
                 topology for an embedded FPGA architecture design
                 targeted for high performance 3D integration. To obtain
                 the optimal MoT-based interconnect structure, the
                 routing architecture of the 2D MoT-based FPGA is
                 modified to include long routing segments that span
                 multiple switch blocks in every row and column. By
                 adjusting the percentage of long wire and span, a 2.5D
                 or 3D high density MoT-based embedded FPGAs can be
                 designed. For the 3D multi-stacked MoT-based FPGAs, the
                 2D MoT-based FPGA is sliced into two or more equal
                 sections by adjusting the length of the long wire span.
                 The long wire segments are realized using 3D through
                 silicon via (TSVs) and 2.5D interposer-based
                 multi-FPGAs, we increase the number of cuts and apply
                 appropriate optimization models to scale down the
                 number of long wires and horizontal inter-FPGA
                 interposer wires. Using our 2.5/3D CAD models, we
                 demonstrate the speed and area of 3D MoT-based FPGA
                 architecture improved by 54\% and 41\% respectively in
                 comparison to 3D Mesh-based FPGAs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '16 conference proceedings.",
}

%%% HEART '16 paper (volume 44, number 4, pages 56--61) on migrating
%%% virtualized FPGA user designs (vFPGAs) between physical FPGAs.
%%% The bibsource field shows it is cross-listed in virtual-machines.bib.
@Article{Knodel:2016:MLR,
  author =       "Oliver Knodel and Paul R. Genssler and Rainer G.
                 Spallek",
  title =        "Migration of long-running Tasks between Reconfigurable
                 Resources using Virtualization",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "4",
  pages =        "56--61",
  month =        sep,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3039902.3039913",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:57 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "Computing performance and scalability are the
                 essential basics in modern data centres. Field
                 Programmable Gate Arrays (FPGAs) provide a promising
                 opportunity to improve performance, security and energy
                 efficiency. Especially background acceleration of
                 computationally complex and long-running tasks is an
                 important field of application. A flexible use of
                 reconfigurable devices within a cloud context requires
                 an abstraction of the actual hardware through
                 virtualization. In this paper we present an approach
                 inspired by paravirtualized machines for the
                 integration of reconfigurable hardware into cloud
                 services. Using partial reconfiguration our hardware
                 and software framework virtualizes a single physical
                 FPGA to enable multiple independent user designs.
                 Essential components are the management of those
                 virtual user-defined accelerators (vFPGA) and their
                 migration between physical FPGAs to achieve higher
                 system-wide utilization. The migration requires saving
                 and restoring the internal state or context of the
                 vFPGA. We demonstrate the application possibilities and
                 the resource trade-off of our approach by transferring
                 a running design from one physical FPGA to another.
                 Moreover, we present future perspectives for the use of
                 FPGAs in cloud-based environments.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '16 conference proceedings.",
}

%%% HEART '16 paper (volume 44, number 4, pages 62--67) comparing
%%% coarse- and fine-grain stacking for 3-D stacked FP-FMA units.
%%% The bibsource field shows it is cross-listed in fparith.bib.
@Article{Tada:2016:ESG,
  author =       "Jubee Tada and Maiki Hosokawa and Ryusuke Egawa and
                 Hiroaki Kobayashi",
  title =        "Effects of Stacking Granularity on {$3$-D} Stacked
                 Floating-point Fused Multiply Add Units",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "4",
  pages =        "62--67",
  month =        sep,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3039902.3039914",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:57 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Three-dimensional stacked integrated circuits
                 (3D-SICs) have been expected to overcome the
                 limitations of conventional two-dimensional (2-D)
                 implemented circuits. Since a stacking strategy affects
                 the performance and the power consumption of 3D-SICs,
                 this paper examines two stacking strategies for
                 designing the 3-D stacked floating-point fused
                 multiply-add (FP-FMA) module which contains four FP-FMA
                 units. Experimental results show that a coarse-grain
                 stacking strategy is suitable for reducing critical
                 path delay of the 3-D stacked FP-FMA module. On the
                 other hand, a fine-grain stacking strategy is suitable
                 for reducing power consumption. The 3-D stacked FP-FMA
                 module which is designed based on a fine-grain stacking
                 strategy achieves an 8.4\% critical path delay
                 reduction and an 18\% average power reduction compared
                 with the 2-D implementation.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  keywords =     "fused multiply-add (FMA) instruction",
  remark =       "HEART '16 conference proceedings.",
}

@Article{Su:2016:NNB,
  author =       "Jiang Su and Jianxiong Liu and David B. Thomas and
                 Peter Y. K. Cheung",
  title =        "Neural Network Based Reinforcement Learning
                 Acceleration on {FPGA} Platforms",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "4",
  pages =        "68--73",
  month =        sep,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3039902.3039915",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:57 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Deep Q-learning (DQN) is a recently proposed
                 reinforcement learning algorithm where a neural network
                 is applied as a non-linear approximator to its value
                 function. The exploitation-exploration mechanism allows
                 the training and prediction of the NN to execute
                 simultaneously in an agent during its interaction with
                 the environment. Agents often act independently on
                 battery power, so the training and prediction must
                 occur within the agent and on a limited power budget.
                 In this work, we propose an FPGA acceleration system
                 design for Neural Network Q-learning (NNQL). Our
                 proposed system has high flexibility due to the support
                 to run-time network parameterization, which allows
                 neuroevolution algorithms to dynamically restructure
                 the network to achieve better learning results.
                 Additionally, the power consumption of our proposed
                 system is adaptive to the network size because of a new
                 processing element design. Based on our test cases on
                 networks with hidden layer size ranging from 32 to
                 16384, our proposed system achieves 7x to 346x speedup
                 compared to GPU implementation and 22x to 77x speedup
                 to hand-coded CPU counterpart.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '16 conference proceedings.",
}

@Article{DHollander:2016:HLS,
  author =       "Erik H. D'Hollander",
  title =        "High-Level Synthesis Optimization for Blocked
                 Floating-Point Matrix Multiplication",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "4",
  pages =        "74--79",
  month =        sep,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3039902.3039916",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:57 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In the last decade floating-point matrix
                 multiplication on FPGAs has been studied extensively
                 and efficient architectures as well as detailed
                 performance models have been developed. By design these
                 IP cores take a fixed footprint which not necessarily
                 optimizes the use of all available resources. Moreover,
                 the low-level architectures are not easily amenable to
                 a parameterized synthesis. In this paper high-level
                 synthesis is used to fine-tune the configuration
                 parameters in order to achieve the highest performance
                 with maximal resource utilization. An exploration
                 strategy is presented to optimize the use of critical
                 resources (DSPs, memory) for any given FPGA. To account
                 for the limited memory size on the FPGA, a
                 block-oriented matrix multiplication is organized such
                 that the block summation is done on the CPU while the
                 block multiplication occurs on the logic fabric
                 simultaneously. The communication overhead between the
                 CPU and the FPGA is minimized by streaming the blocks
                 in a Gray code ordering scheme which maximizes the data
                 reuse for consecutive block matrix product
                 calculations. Using high-level synthesis optimization,
                 the programmable logic operates at 93\% of the
                 theoretical peak performance and the combined CPU-FPGA
                 design achieves 76\% of the available hardware
                 processing speed for the floating-point multiplication
                 of 2K by 2K matrices.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '16 conference proceedings.",
}

%%% HEART '16 paper (volume 44, number 4) on an FPGA pipeline that
%%% tracks volleyball players in video.
%%% NOTE(review): the page range below ends on page 86, which is also
%%% the first page of the next entry (Zhao:2016:SHC, pages 86--91).
%%% One of the two ranges is probably off by one; confirm against the
%%% issue's table of contents before changing either.
@Article{Li:2016:FBV,
  author =       "Chengzhe Li and Lai Yoong Yee and Hiroshi Maruyama and
                 Yoshiki Yamaguchi",
  title =        "{FPGA}-based Volleyball Player Tracker",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "4",
  pages =        "80--86",
  month =        sep,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3039902.3039917",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:57 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The significant challenge facing sport science is how
                 to grasp the flow of the game and analyze the situation
                 of a match. The use of information technology will
                 facilitate to achieve the goal. The technical issues
                 from the practical application perspective can be
                 classified into three main points: computation speed,
                 system size and complex data analysis considering the
                 accuracy. In this paper, for accelerating image
                 recognition and object tracking, we propose a
                 one-dimensional data pipeline architecture on a
                 field-programmable gate array (FPGA). It satisfies both
                 of high-speed streaming computation and small-sized
                 circuits by considering spatiotemporal data dependence.
                 Volleyball games have been chosen as a target
                 application. The proposed system will identify the
                 position of six volleyball players within real time.
                 The design on an FPGA includes pre-processing, color
                 filtering, digitalization, noise reduction, template
                 matching, and so on. The design was implemented and
                 evaluated on Atlys Spartan-6 FPGA Trainer Board with
                 one XILINX Spartan-6 LX45 FPGA. The computational
                 performance achieves 100 frames per second at SVGA 800
                 by 600 pixel resolution. And our design has good
                 scalability; the performance can easily be enhanced
                 when the larger FPGA is used. The proposed system is
                 also compact, which is composed of one Atlys board and
                 one Atlys VmodCAM stereo-camera board. The
                 average-accuracy rates of pregame situation and during
                 a match are 87.1\% and 65.7\%, respectively. Since the
                 input is streaming data, we can improve the accuracy by
                 considering the previous and the next frames. They
                 could be improved to 90.4\% and 72.2\%, respectively,
                 when we adopt template matching with a moving average
                 filter.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '16 conference proceedings.",
}

@Article{Zhao:2016:SHC,
  author =       "Qian Zhao and Motoki Amagasaki and Masahiro Iida and
                 Morihiro Kuga and Toshinori Sueyoshi",
  title =        "A Study of Heterogeneous Computing Design Method based
                 on Virtualization Technology",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "4",
  pages =        "86--91",
  month =        sep,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3039902.3039918",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:57 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "One challenge for the heterogeneous computing with the
                 FPGA is how to bridge the development gap between SW
                 and HW designs. The high level synthesis (HLS)
                 technique allows producing hardware with high level
                 languages like C. Design tools based on the HLS like
                 Xilinx SDSoC and SDAccel are developed to speedup SW/HW
                 co-designs. However, the developers still require much
                 circuit design skills to use these tools more
                 efficiently. In this paper, we propose a heterogeneous
                 computing platform based on the virtualization
                 technology, namely hCODE. With the help of the
                 virtualization, the HW and SW design can be totally
                 separated. This brings multiple benefits like
                 accelerating a program without modifying or recompiling
                 it, enable high portability and scalability across
                 different HW and operating system.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '16 conference proceedings.",
}

%%% HEART '16 paper (volume 44, number 4, pages 92--97) comparing an
%%% FPGA HLS tool (Vivado HLS) with an FPGA overlay tool (ArchSyn) on
%%% matrix-matrix multiplication and FFT kernels.
@Article{Lin:2016:FHL,
  author =       "Colin Yu Lin and Zhenghong Jiang and Cheng Fu and
                 Hayden Kwok-Hay So and Haigang Yang",
  title =        "{FPGA} High-level Synthesis versus Overlay:
                 Comparisons on Computation Kernels",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "4",
  pages =        "92--97",
  month =        sep,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3039902.3039919",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:57 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "To promote FPGA to a wider user community and to
                 increase design productivity, two new design
                 methodologies, namely FPGA high-level synthesis (HLS)
                 and FPGA overlay, are presented to use a high-level
                 design abstraction. To make clear distinguish features
                 of each design methodology, we make an comparison of a
                 state-of-the-art FPGA HLS tool, Vivado HLS, and an FPGA
                 overlay tool, ArchSyn, on two computation intensive
                 kernels, matrix-matrix multiplication and fast Fourier
                 transform. In the comparison, FPGA overlay shows an
                 overwhelming superiority in computation performance,
                 which is 8X to 39X faster than FPGA HLS. However, FPGA
                 HLS exhibits its advantages in dynamic power
                 consumption metric. It achieves up to 17X lower power
                 consumption than FPGA overlay. Power- and
                 energy-efficiency are another two essential metrics
                 evaluating trade-offs between performance and power
                 consumption. As demonstrated with evaluation results,
                 FPGA overlay is averagely 3.5X better in
                 power-efficiency for FFT kernel, and achieves up to 2
                 orders of magnitude better energy-efficiency than FPGA
                 HLS.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "HEART '16 conference proceedings.",
}

@Article{Zhan:2016:PMB,
  author =       "Xusheng Zhan and Yungang Bao and Christian Bienia and
                 Kai Li",
  title =        "{PARSEC3.0}: a Multicore Benchmark Suite with Network
                 Stacks and {SPLASH-2X}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "5",
  pages =        "1--16",
  month =        dec,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3053277.3053279",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Benchmarks play a very important role in accelerating
                 the development and research of CMP. As one of them,
                 the PARSEC suite continues to be updated and revised
                 over and over again so that it can offer better support
                 for researchers. The former versions of PARSEC have
                 enough workloads to evaluate the property of CMP about
                 CPU, cache and memory, but it lacks of applications
                 based on network stack to assess the performance of
                 CMPs in respect of network. In this work, we introduce
                 PARSEC3.0, a new version of PARSEC suite that
                 implements a user-level network stack and generates
                 three network workloads with this stack to cover
                 network domain. We explore the input sets of splash-2
                 and expand them to multiple scales, a.k.a, splash-2x.
                 We integrate splash-2 and splash-2x into PARSEC
                 framework so that researchers use these benchmark suite
                 conveniently. Finally, we evaluate the u-TCP/IP stack
                 and new network workloads, and analyze the
                 characterizes of splash-2 and splash-2x.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chen:2017:BDA,
  author =       "Yunji Chen",
  title =        "Big Data Analytics and Intelligence at Alibaba Cloud",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "1--1",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037699",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "As China's largest cloud service provider, Alibaba
                 Cloud has been one of the fastest growing cloud
                 computing platforms in the world. In this talk, I'll
                 present an overview of Big Data and AI computing
                 platform at Alibaba Cloud, which consists of a wide
                 range of products and services to enable fast and
                 efficient big data development and intelligent
                 analysis. The underlying computing infrastructure
                 supports a variety of computation scenarios, including
                 batch, interactive, stream, and graph computation, as
                 well as large-scale machine learning on heterogeneous
                 cloud-scale data centers. Several big data products,
                 such as rule-based engine, recommendation system, BI
                 tools, etc., are provided to address different business
                 needs. The platform not only supports Alibaba's
                 internal businesses but also provides solid services to
                 enterprise customers. In addition, I'll describe key
                 techniques and system internals, and outline
                 outstanding research and engineering challenges.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'17 conference proceedings",
}

%%% ASPLOS'17 paper (volume 45, number 1, pages 3--16) on hardware-software
%%% co-analysis to bound application-specific peak power and energy for
%%% ultra-low-power processors.
@Article{Cherupalli:2017:DAS,
  author =       "Hari Cherupalli and Henry Duwe and Weidong Ye and
                 Rakesh Kumar and John Sartori",
  title =        "Determining Application-specific Peak Power and Energy
                 Requirements for Ultra-low Power Processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "3--16",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037711",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Many emerging applications such as IoT, wearables,
                 implantables, and sensor networks are power- and
                 energy-constrained. These applications rely on
                 ultra-low-power processors that have rapidly become the
                 most abundant type of processor manufactured today. In
                 the ultra-low-power embedded systems used by these
                 applications, peak power and energy requirements are
                 the primary factors that determine critical system
                 characteristics, such as size, weight, cost, and
                 lifetime. While the power and energy requirements of
                 these systems tend to be application-specific,
                 conventional techniques for rating peak power and
                 energy cannot accurately bound the power and energy
                 requirements of an application running on a processor,
                 leading to over-provisioning that increases system size
                 and weight. In this paper, we present an automated
                 technique that performs hardware-software co-analysis
                 of the application and ultra-low-power processor in an
                 embedded system to determine application-specific peak
                 power and energy requirements. Our technique provides
                 more accurate, tighter bounds than conventional
                 techniques for determining peak power and energy
                 requirements, reporting 15\% lower peak power and 17\%
                 lower peak energy, on average, than a conventional
                 approach based on profiling and guardbanding. Compared
                 to an aggressive stressmark-based approach, our
                 technique reports power and energy bounds that are 26\%
                 and 26\% lower, respectively, on average. Also, unlike
                 conventional approaches, our technique reports
                 guaranteed bounds on peak power and energy independent
                 of an application's input set. Tighter bounds on peak
                 power and energy can be exploited to reduce system
                 size, weight, and cost.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'17 conference proceedings",
}

@Article{Chen:2017:PPQ,
  author =       "Quan Chen and Hailong Yang and Minyi Guo and Ram
                 Srivatsa Kannan and Jason Mars and Lingjia Tang",
  title =        "{Prophet}: Precise {QoS} Prediction on Non-Preemptive
                 Accelerators to Improve Utilization in Warehouse-Scale
                 Computers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "17--32",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037700",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Guaranteeing Quality-of-Service (QoS) of
                 latency-sensitive applications while improving server
                 utilization through application co-location is
                 important yet challenging in modern datacenters. The
                 key challenge is that when applications are co-located
                 on a server, performance interference due to resource
                 contention can be detrimental to the application QoS.
                 Although prior work has proposed techniques to identify
                 ``safe'' co-locations where application QoS is
                 satisfied by predicting the performance interference on
                 multicores, no such prediction technique on
                 accelerators such as GPUs. In this work, we present
                 Prophet, an approach to precisely predict the
                 performance degradation of latency-sensitive
                 applications on accelerators due to application
                 co-location. We analyzed the performance interference
                 on accelerators through a real system investigation and
                 found that unlike on multicores where the key
                 contentious resources are shared caches and main memory
                 bandwidth, the key contentious resources on
                 accelerators are instead processing elements,
                 accelerator memory bandwidth and PCIe bandwidth. Based
                 on this observation, we designed interference models
                 that enable the precise prediction for processing
                 element, accelerator memory bandwidth and PCIe
                 bandwidth contention on real hardware. By using a novel
                 technique to forecast solo-run execution traces of the
                 co-located applications using interference models,
                 Prophet can accurately predict the performance
                 degradation of latency-sensitive applications on
                 non-preemptive accelerators. Using Prophet, we can
                 identify ``safe'' co-locations on accelerators to
                 improve utilization without violating the QoS target.
                 Our evaluation shows that Prophet can predict the
                 performance degradation with an average prediction
                 error 5.47\% on real systems. Meanwhile, based on the
                 prediction, Prophet achieves accelerator utilization
                 improvements of 49.9\% on average while maintaining the
                 QoS target of latency-sensitive applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'17 conference proceedings",
}

@Article{Kanev:2017:MAM,
  author =       "Svilen Kanev and Sam Likun Xi and Gu-Yeon Wei and
                 David Brooks",
  title =        "{Mallacc}: Accelerating Memory Allocation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "33--45",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037736",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Recent work shows that dynamic memory allocation
                 consumes nearly 7\% of all cycles in Google
                 datacenters. With the trend towards increased
                 specialization of hardware, we propose Mallacc, an
                 in-core hardware accelerator designed for broad use
                 across a number of high-performance, modern memory
                 allocators. The design of Mallacc is quite different
                 from traditional throughput-oriented hardware
                 accelerators. Because memory allocation requests tend
                 to be very frequent, fast, and interspersed inside
                 other application code, accelerators must be optimized
                 for latency rather than throughput and area overheads
                 must be kept to a bare minimum. Mallacc accelerates the
                 three primary operations of a typical memory allocation
                 request: size class computation, retrieval of a free
                 memory block, and sampling of memory usage. Our results
                 show that malloc latency can be reduced by up to 50\%
                 with a hardware cost of less than 1500 $\mu$m$^2$ of
                 silicon area, less than 0.006\% of a typical
                 high-performance processor core.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'17 conference proceedings",
}

@Article{Wen:2017:REV,
  author =       "Shasha Wen and Milind Chabbi and Xu Liu",
  title =        "{REDSPY}: Exploring Value Locality in Software",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "47--61",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037729",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Complex code bases with several layers of abstractions
                 have abundant inefficiencies that affect the execution
                 time. Value redundancy is a kind of inefficiency where
                 the same values are repeatedly computed, stored, or
                 retrieved over the course of execution. Not all
                 redundancies can be easily detected or eliminated with
                 compiler optimization passes due to the inherent
                 limitations of the static analysis. Microscopic
                 observation of whole executions at instruction- and
                 operand-level granularity breaks down abstractions and
                 helps recognize redundancies that masquerade in complex
                 programs. We have developed REDSPY---a fine-grained
                 profiler to pinpoint and quantify redundant operations
                 in program executions. Value redundancy may happen over
                 time at same locations or in adjacent locations, and
                 thus it has temporal and spatial locality. REDSPY
                 identifies both temporal and spatial value locality.
                 Furthermore, REDSPY is capable of identifying values
                 that are approximately the same, enabling optimization
                 opportunities in HPC codes that often use floating
                 point computations. REDSPY provides intuitive
                 optimization guidance by apportioning redundancies to
                 their provenance---source lines and execution calling
                 contexts. REDSPY pinpointed dramatically high volume of
                 redundancies in programs that were optimization targets
                 for decades, such as SPEC CPU2006 suite, Rodinia
                 benchmark, and NWChem---a production computational
                 chemistry code. Guided by REDSPY, we were able to
                 eliminate redundancies that resulted in significant
                 speedups.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'17 conference proceedings",
}

@Article{Bhattacharjee:2017:TTP,
  author =       "Abhishek Bhattacharjee",
  title =        "Translation-Triggered Prefetching",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "63--76",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037705",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "We propose translation-enabled memory prefetching
                 optimizations or TEMPO, a low-overhead hardware
                 mechanism to boost memory performance by exploiting the
                 operating system's (OS) virtual memory subsystem. We
                 are the first to make the following observations: (1) a
                 substantial fraction (20--40\%) of DRAM references in
                 modern big-data workloads are devoted to accessing page
                 tables; and (2) when memory references require page
                 table lookups in DRAM, the vast majority of them
                 (98\%+) also look up DRAM for the subsequent data
                 access. TEMPO exploits these observations to enable
                 DRAM row-buffer and on-chip cache prefetching of the
                 data that page tables point to. TEMPO requires trivial
                 changes to the memory controller (under 3\% additional
                 area), no OS or application changes, and improves
                 performance by 10--30\% and energy by 1--14\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'17 conference proceedings",
}

@Article{Kim:2017:TAA,
  author =       "Channoh Kim and Jaehyeok Kim and Sungmin Kim and
                 Dooyoung Kim and Namho Kim and Gitae Na and Young H. Oh
                 and Hyeon Gyu Cho and Jae W. Lee",
  title =        "Typed Architectures: Architectural Support for
                 Lightweight Scripting",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "77--90",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037726",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Dynamic scripting languages are becoming more and more
                 widely adopted not only for fast prototyping but also
                 for developing production-grade applications. They
                 provide high-productivity programming environments
                 featuring high levels of abstraction with powerful
                 built-in functions, automatic memory management,
                 object-oriented programming paradigm and dynamic
                 typing. However, their flexible, dynamic type systems
                 easily become the source of inefficiency in terms of
                 instruction count, memory footprint, and energy
                 consumption. This overhead makes it challenging to
                 deploy these high-productivity programming technologies
                 on emerging single-board computers for IoT
                 applications. Addressing this challenge, this paper
                 introduces Typed Architectures, a high-efficiency,
                 low-cost execution substrate for dynamic scripting
                 languages, where each data variable retains high-level
                 type information at an ISA level. Typed Architectures
                 calculate and check the dynamic type of each variable
                 implicitly in hardware, rather than explicitly in
                 software, hence significantly reducing instruction
                 count for dynamic type checking. Besides, Typed
                 Architectures introduce polymorphic instructions (e.g.,
                 xadd), which are bound to the correct native
                 instruction at runtime within the pipeline (e.g., add
                 or fadd) to efficiently implement polymorphic
                 operators. Finally, Typed Architectures provide
                 hardware support for flexible yet efficient type tag
                 extraction and insertion, capturing common data layout
                 patterns of tag-value pairs. Our evaluation using a
                 fully synthesizable RISC-V RTL design on FPGA shows
                 that Typed Architectures achieve geomean speedups of
                 11.2\% and 9.9\% with maximum speedups of 32.6\% and
                 43.5\% for two production-grade scripting engines for
                 JavaScript and Lua, respectively. Moreover, Typed
                 Architectures improve the energy-delay product (EDP) by
                 19.3\% for JavaScript and 16.5\% for Lua with an area
                 overhead of 1.6\% at a 40nm technology node.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'17 conference proceedings",
}

@Article{Seo:2017:FAS,
  author =       "Jihye Seo and Wook-Hee Kim and Woongki Baek and
                 Beomseok Nam and Sam H. Noh",
  title =        "Failure-Atomic Slotted Paging for Persistent Memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "91--104",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037737",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The slotted-page structure is a database page format
                 commonly used for managing variable-length records. In
                 this work, we develop a novel ``failure-atomic slotted
                 page structure'' for persistent memory that leverages
                 byte addressability and durability of persistent memory
                 to minimize redundant write operations used to maintain
                 consistency in traditional database systems.
                 Failure-atomic slotted paging consists of two key
                 elements: (i) in-place commit per page using hardware
                 transactional memory and (ii) slot header logging that
                 logs the commit mark of each page. The proposed scheme
                 is implemented in SQLite and compared against NVWAL,
                 the current state-of-the-art scheme. Our performance
                 study shows that our failure-atomic slotted paging
                 shows optimal performance for database transactions
                 that insert a single record. For transactions that
                 touch more than one database page, our proposed
                 slot-header logging scheme minimizes the logging
                 overhead by avoiding duplicating pages and logging only
                 the metadata of the dirty pages. Overall, we find that
                 our failure-atomic slotted-page management scheme
                 reduces database logging overhead to 1/6 and improves
                 query response time by up to 33\% compared to NVWAL.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'17 conference proceedings",
}

@Article{Nguyen:2017:WSP,
  author =       "Donald Nguyen and Keshav Pingali",
  title =        "What Scalable Programs Need from Transactional
                 Memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "105--118",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037750",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Transactional memory (TM) has been the focus of
                 numerous studies, and it is supported in processors
                 such as the IBM Blue Gene/Q and Intel Haswell. Many
                 studies have used the STAMP benchmark suite to evaluate
                 their designs. However, the speedups obtained for the
                 STAMP benchmarks on all TM systems we know of are quite
                 limited; for example, with 64 threads on the IBM Blue
                 Gene/Q, we observe a median speedup of 1.4X using the
                 Blue Gene/Q hardware transactional memory (HTM), and a
                 median speedup of 4.1X using a software transactional
                 memory (STM). What limits the performance of these
                 benchmarks on TMs? In this paper, we argue that the
                 problem lies with the programming model and data
                 structures used to write them. To make this point, we
                 articulate two principles that we believe must be
                 embodied in any scalable program and argue that STAMP
                 programs violate both of them. By modifying the STAMP
                 programs to satisfy both principles, we produce a new
                 set of programs that we call the Stampede suite. Its
                 median speedup on the Blue Gene/Q is 8.0X when using an
                 STM. The two principles also permit us to simplify the
                 TM design. Using this new STM with the Stampede
                 benchmarks, we obtain a median speedup of 17.7X with 64
                 threads on the Blue Gene/Q and 13.2X with 32 threads on
                 an Intel Westmere system. These results suggest that
                 HTM and STM designs will benefit if more attention is
                 paid to the division of labor between application
                 programs, systems software, and hardware.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'17 conference proceedings",
}

@Article{Trippel:2017:TMM,
  author =       "Caroline Trippel and Yatin A. Manerkar and Daniel
                 Lustig and Michael Pellauer and Margaret Martonosi",
  title =        "{TriCheck}: Memory Model Verification at the
                 Trisection of Software, Hardware, and {ISA}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "119--133",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037719",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Memory consistency models (MCMs) which govern
                 inter-module interactions in a shared memory system,
                 are a significant, yet often under-appreciated, aspect
                 of system design. MCMs are defined at the various
                 layers of the hardware-software stack, requiring
                 thoroughly verified specifications, compilers, and
                 implementations at the interfaces between layers.
                 Current verification techniques evaluate segments of
                 the system stack in isolation, such as proving compiler
                 mappings from a high-level language (HLL) to an ISA or
                 proving validity of a microarchitectural implementation
                 of an ISA. This paper makes a case for full-stack MCM
                 verification and provides a toolflow, TriCheck, capable
                 of verifying that the HLL, compiler, ISA, and
                 implementation collectively uphold MCM requirements.
                 The work showcases TriCheck's ability to evaluate a
                 proposed ISA MCM in order to ensure that each layer and
                 each mapping is correct and complete. Specifically, we
                 apply TriCheck to the open source RISC-V ISA,
                 seeking to verify accurate, efficient, and legal
                 compilations from C11. We uncover under-specifications
                 and potential inefficiencies in the current RISC-V ISA
                 documentation and identify possible solutions for each.
                 As an example, we find that a RISC-V-compliant
                 microarchitecture allows 144 outcomes forbidden by C11
                 to be observed out of 1,701 litmus tests examined.
                 Overall, this paper demonstrates the necessity of
                 full-stack verification for detecting MCM-related bugs
                 in the hardware-software stack.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'17 conference proceedings",
}

@Article{Nalli:2017:APM,
  author =       "Sanketh Nalli and Swapnil Haria and Mark D. Hill and
                 Michael M. Swift and Haris Volos and Kimberly Keeton",
  title =        "An Analysis of Persistent Memory Use with {WHISPER}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "135--148",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037730",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Emerging non-volatile memory (NVM) technologies
                 promise durability with read and write latencies
                 comparable to volatile memory (DRAM). We define
                 Persistent Memory (PM) as NVM accessed with byte
                 addressability at low latency via normal memory
                 instructions. Persistent-memory applications ensure the
                 consistency of persistent data by inserting ordering
                 points between writes to PM allowing the construction
                 of higher-level transaction mechanisms. An epoch is a
                 set of writes to PM between ordering points. To put
                 systems research in PM on a firmer footing, we
                 developed and analyzed a PM benchmark suite called
                 WHISPER (Wisconsin-HP Labs Suite for Persistence) that
                 comprises ten PM applications we gathered to cover all
                 current interfaces to PM. A quantitative analysis
                 reveals several insights: (a) only 4\% of writes in
                 PM-aware applications are to PM and the rest are to
                 volatile memory, (b) software transactions are often
                 implemented with 5 to 50 ordering points (c) 75\% of
                 epochs update exactly one 64B cache line, (d) 80\% of
                 epochs from the same thread depend on previous epochs
                 from the same thread, while few epochs depend on epochs
                 from other threads. Based on our analysis, we propose
                 the Hands-off Persistence System (HOPS) to track
                 updates to PM in hardware. Current hardware design
                 requires applications to force data to PM as each epoch
                 ends. HOPS provides high-level ISA primitives for
                 applications to express durability and ordering
                 constraints separately and enforces them automatically,
                 while achieving 24.3\% better performance over current
                 approaches to persistence.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'17 conference proceedings",
}

@Article{Zhang:2017:PPD,
  author =       "Tong Zhang and Changhee Jung and Dongyoon Lee",
  title =        "{ProRace}: Practical Data Race Detection for
                 Production Use",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "149--162",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037708",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper presents ProRace, a dynamic data race
                 detector practical for production runs. It is
                 lightweight, but still offers high race detection
                 capability. To track memory accesses, ProRace leverages
                 instruction sampling using the performance monitoring
                 unit (PMU) in commodity processors. Our PMU driver
                 enables ProRace to sample more memory accesses at a
                 lower cost compared to the state-of-the-art Linux
                 driver. Moreover, ProRace uses PMU-provided execution
                 contexts including register states and program path,
                 and reconstructs unsampled memory accesses offline.
                 This technique allows ProRace to overcome inherent
                 limitations of sampling and improve the detection
                 coverage by performing data race detection on the trace
                 with not only sampled but also reconstructed memory
                 accesses. Experiments using racy production software
                 including apache and mysql shows that, with a
                 reasonable offline cost, ProRace incurs only 2.6\%
                 overhead at runtime with 27.5\% detection probability
                 with a sampling period of 10,000.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'17 conference proceedings",
}

@Article{Olson:2017:CGM,
  author =       "Lena E. Olson and Mark D. Hill and David A. Wood",
  title =        "Crossing Guard: Mediating Host-Accelerator Coherence
                 Interactions",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "163--176",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037715",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Specialized hardware accelerators have performance and
                 energy-efficiency advantages over general-purpose
                 processors. To fully realize these benefits and aid
                 programmability, accelerators may share a physical and
                 virtual address space and full cache coherence with the
                 host system. However, allowing accelerators ---
                 particularly those designed by third parties --- to
                 directly communicate with host coherence protocols
                 poses several problems. Host coherence protocols are
                 complex, vary between companies, and may be
                 proprietary, increasing burden on accelerator
                 designers. Bugs in the accelerator implementation may
                 cause crashes and other serious consequences to the
                 host system. We propose Crossing Guard, a coherence
                 interface between the host coherence system and
                 accelerators. The Crossing Guard interface provides the
                 accelerator designer with a standardized set of
                 coherence messages that are simple enough to aid in
                 design of bug-free coherent caches. At the same time,
                 they are sufficiently complex to allow customized and
                 optimized accelerator caches with performance
                 comparable to using the host protocol. The Crossing
                 Guard hardware is implemented as part of the trusted
                 host, and provides complete safety to the host
                 coherence system, even in the presence of a
                 pathologically buggy accelerator cache.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'17 conference proceedings",
}

@Article{McMahan:2017:ASF,
  author =       "Joseph McMahan and Michael Christensen and Lawton
                 Nichols and Jared Roesch and Sung-Yee Guo and Ben
                 Hardekopf and Timothy Sherwood",
  title =        "An Architecture Supporting Formal and Compositional
                 Binary Analysis",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "177--191",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037733",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Building a trustworthy life-critical embedded system
                 requires deep reasoning about the potential effects
                 that sequences of machine instructions can have on full
                 system operation. Rather than trying to analyze
                 complete binaries and the countless ways their
                 instructions can interact with one another --- memory,
                 side effects, control registers, implicit state, etc.
                 --- we explore a new approach. We propose an
                 architecture controlled by a thin computational layer
                 designed to tightly correspond with the lambda
                 calculus, drawing on principles of functional
                 programming to bring the assembly much closer to myriad
                 reasoning frameworks, such as the Coq proof assistant.
                 This approach allows assembly-level verified versions
                 of critical code to operate safely in tandem with
                 arbitrary code, including imperative and unverified
                 system components, without the need for large
                 supporting trusted computing bases. We demonstrate that
                 this computational layer can be built in such a way as
                 to simultaneously provide full programmability and
                 compact, precise, and complete semantics, while still
                 using hardware resources comparable to normal embedded
                 systems. To demonstrate the practicality of this
                 approach, our FPGA-implemented prototype runs an
                 embedded medical application which monitors and treats
                 life-threatening arrhythmias. Though the system
                 integrates untrusted and imperative components, our
                 architecture allows for the formal verification of
                 multiple properties of the end-to-end system, including
                 a proof of correctness of the assembly-level
                 implementation of the core algorithm, the integrity of
                 trusted data via a non-interference proof, and a
                 guarantee that our prototype meets critical timing
                 requirements.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'17 conference proceedings",
}

@Article{Hsiao:2017:ASI,
  author          = {Chun-Hung Hsiao and Satish Narayanasamy and Essam
                     Muhammad Idris Khan and Cristiano L. Pereira and Gilles
                     A. Pokam},
  title           = {{AsyncClock}: Scalable Inference of Asynchronous Event
                     Causality},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = {2017},
  month           = mar,
  volume          = {45},
  number          = {1},
  pages           = {193--205},
  DOI             = {https://doi.org/10.1145/3093337.3037712},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  remark          = {ASPLOS'17 conference proceedings},
  abstract        = {Asynchronous programming model is commonly used in
                     mobile systems and Web 2.0 environments. Asynchronous
                     race detectors use algorithms that are an order of
                     magnitude performance and space inefficient compared to
                     conventional data race detectors. We solve this problem
                     by identifying and addressing two important problems in
                     reasoning about causality between asynchronous events.
                     Unlike conventional signal-wait operations,
                     establishing causal order between two asynchronous
                     events is fundamentally more challenging as there is no
                     common handle they operate on. We propose a new
                     primitive named AsyncClock that addresses this problem
                     by explicitly tracking causally preceding events, and
                     show that AsyncClock can handle a wide variety of
                     asynchronous causality models. We also address the
                     important scalability problem of efficiently
                     identifying heirless events whose metadata can be
                     reclaimed. We built the first single-pass,
                     non-graph-based Android race detector using our
                     algorithm and applied it to find errors in 20 popular
                     applications. Our tool incurs about 6x performance
                     overhead, which is several times more efficient than
                     the state-of-the-art solution. It also scales well with
                     the execution length. We used our tool to find 147
                     previously unknown harmful races.},
  bibdate         = {Mon Jun 5 18:01:58 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Calciu:2017:BBC,
  author          = {Irina Calciu and Siddhartha Sen and Mahesh
                     Balakrishnan and Marcos K. Aguilera},
  title           = {Black-box Concurrent Data Structures for {NUMA}
                     Architectures},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = {2017},
  month           = mar,
  volume          = {45},
  number          = {1},
  pages           = {207--221},
  DOI             = {https://doi.org/10.1145/3093337.3037721},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  remark          = {ASPLOS'17 conference proceedings},
  abstract        = {High-performance servers are Non-Uniform Memory Access
                     (NUMA) machines. To fully leverage these machines,
                     programmers need efficient concurrent data structures
                     that are aware of the NUMA performance artifacts. We
                     propose Node Replication (NR), a black-box approach to
                     obtaining such data structures. NR takes an arbitrary
                     sequential data structure and automatically transforms
                     it into a NUMA-aware concurrent data structure
                     satisfying linearizability. Using NR requires no
                     expertise in concurrent data structure design, and the
                     result is free of concurrency bugs. NR draws ideas from
                     two disciplines: shared-memory algorithms and
                     distributed systems. Briefly, NR implements a
                     NUMA-aware shared log, and then uses the log to
                     replicate data structures consistently across NUMA
                     nodes. NR is best suited for contended data structures,
                     where it can outperform lock-free algorithms by 3.1x,
                     and lock-based solutions by 30x. To show the benefits
                     of NR to a real application, we apply NR to the data
                     structures of Redis, an in-memory storage system. The
                     result outperforms other methods by up to 14x. The cost
                     of NR is additional memory for its log and replicas.},
  bibdate         = {Mon Jun 5 18:01:58 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Vora:2017:CCR,
  author          = {Keval Vora and Chen Tian and Rajiv Gupta and Ziang
                     Hu},
  title           = {{CoRAL}: Confined Recovery in Distributed Asynchronous
                     Graph Processing},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = {2017},
  month           = mar,
  volume          = {45},
  number          = {1},
  pages           = {223--236},
  DOI             = {https://doi.org/10.1145/3093337.3037747},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  remark          = {ASPLOS'17 conference proceedings},
  abstract        = {Existing distributed asynchronous graph processing
                     systems employ checkpointing to capture globally
                     consistent snapshots and rollback all machines to most
                     recent checkpoint to recover from machine failures. In
                     this paper we argue that recovery in distributed
                     asynchronous graph processing does not require the
                     entire execution state to be rolled back to a globally
                     consistent state due to the relaxed asynchronous
                     execution semantics. We define the properties required
                     in the recovered state for it to be usable for correct
                     asynchronous processing and develop CoRAL, a
                     lightweight checkpointing and recovery algorithm.
                     First, this algorithm carries out confined recovery
                     that only rolls back graph execution states of the
                     failed machines to affect recovery. Second, it relies
                     upon lightweight checkpoints that capture locally
                     consistent snapshots with a reduced peak network
                     bandwidth requirement. Our experiments using real-world
                     graphs show that our technique recovers from failures
                     and finishes processing 1.5x to 3.2x faster compared to
                     the traditional asynchronous checkpointing and recovery
                     mechanism when failures impact 1 to 6 machines of a 16
                     machine cluster. Moreover, capturing locally consistent
                     snapshots significantly reduces intermittent high peak
                     bandwidth usage required to save the snapshots --- the
                     average reduction in 99th percentile bandwidth ranges
                     from 22\% to 51\% while 1 to 6 snapshot replicas are
                     being maintained.},
  bibdate         = {Mon Jun 5 18:01:58 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Vora:2017:KFA,
  author          = {Keval Vora and Rajiv Gupta and Guoqing Xu},
  title           = {{KickStarter}: Fast and Accurate Computations on
                     Streaming Graphs via Trimmed Approximations},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = {2017},
  month           = mar,
  volume          = {45},
  number          = {1},
  pages           = {237--251},
  DOI             = {https://doi.org/10.1145/3093337.3037748},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  remark          = {ASPLOS'17 conference proceedings},
  abstract        = {Continuous processing of a streaming graph maintains
                     an approximate result of the iterative computation on a
                     recent version of the graph. Upon a user query, the
                     accurate result on the current graph can be quickly
                     computed by feeding the approximate results to the
                     iterative computation --- a form of incremental
                     computation that corrects the (small amount of) error
                     in the approximate result. Despite the effectiveness of
                     this approach in processing growing graphs, it is
                     generally not applicable when edge deletions are
                     present --- existing approximations can lead to either
                     incorrect results (e.g., monotonic computations
                     terminate at an incorrect minima/maxima) or poor
                     performance (e.g., with approximations, convergence
                     takes longer than performing the computation from
                     scratch). This paper presents KickStarter, a runtime
                     technique that can trim the approximate values for a
                     subset of vertices impacted by the deleted edges. The
                     trimmed approximation is both safe and profitable,
                     enabling the computation to produce correct results and
                     converge quickly. KickStarter works for a class of
                     monotonic graph algorithms and can be readily
                     incorporated in any existing streaming graph system.
                     Our experiments with four streaming algorithms on five
                     large graphs demonstrate that trimming not only
                     produces correct results but also accelerates these
                     algorithms by 8.5--23.7x.},
  bibdate         = {Mon Jun 5 18:01:58 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Powers:2017:BBG,
  author          = {Bobby Powers and John Vilk and Emery D. Berger},
  title           = {{Browsix}: Bridging the Gap Between {Unix} and the
                     Browser},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = {2017},
  month           = mar,
  volume          = {45},
  number          = {1},
  pages           = {253--266},
  DOI             = {https://doi.org/10.1145/3093337.3037727},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  remark          = {ASPLOS'17 conference proceedings},
  abstract        = {Applications written to run on conventional operating
                     systems typically depend on OS abstractions like
                     processes, pipes, signals, sockets, and a shared file
                     system. Porting these applications to the web currently
                     requires extensive rewriting or hosting significant
                     portions of code server-side because browsers present a
                     nontraditional runtime environment that lacks OS
                     functionality. This paper presents Browsix, a framework
                     that bridges the considerable gap between conventional
                     operating systems and the browser, enabling unmodified
                     programs expecting a Unix-like environment to run
                     directly in the browser. Browsix comprises two core
                     parts: (1) a JavaScript-only system that makes core
                     Unix features (including pipes, concurrent processes,
                     signals, sockets, and a shared file system) available
                     to web applications; and (2) extended JavaScript
                     runtimes for C, C++, Go, and Node.js that support
                     running programs written in these languages as
                     processes in the browser. Browsix supports running a
                     POSIX shell, making it straightforward to connect
                     applications together via pipes. We illustrate
                     Browsix's capabilities via case studies that
                     demonstrate how it eases porting legacy applications to
                     the browser and enables new functionality. We
                     demonstrate a Browsix-enabled LaTeX editor that
                     operates by executing unmodified versions of pdfLaTeX
                     and BibTeX. This browser-only LaTeX editor can render
                     documents in seconds, making it fast enough to be
                     practical. We further demonstrate how Browsix lets us
                     port a client-server application to run entirely in the
                     browser for disconnected operation. Creating these
                     applications required less than 50 lines of glue code
                     and no code modifications, demonstrating how easily
                     Browsix can be used to build sophisticated web
                     applications from existing parts without
                     modification.},
  bibdate         = {Mon Jun 5 18:01:58 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib;
                     https://www.math.utah.edu/pub/tex/bib/unix.bib},
  acknowledgement = ack-nhfb,
}

@Article{Rajbhandari:2017:OCM,
  author          = {Samyam Rajbhandari and Yuxiong He and Olatunji Ruwase
                     and Michael Carbin and Trishul Chilimbi},
  title           = {Optimizing {CNNs} on Multicores for Scalability,
                     Performance and Goodput},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = {2017},
  month           = mar,
  volume          = {45},
  number          = {1},
  pages           = {267--280},
  DOI             = {https://doi.org/10.1145/3093337.3037745},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  remark          = {ASPLOS'17 conference proceedings},
  abstract        = {Convolutional Neural Networks (CNN) are a class of
                     Artificial Neural Networks (ANN) that are highly
                     efficient at the pattern recognition tasks that
                     underlie difficult AI problems in a variety of domains,
                     such as speech recognition, object recognition, and
                     natural language processing. CNNs are, however,
                     computationally intensive to train. This paper presents
                     the first characterization of the performance
                     optimization opportunities for training CNNs on CPUs.
                     Our characterization includes insights based on the
                     structure of the network itself (i.e., intrinsic
                     arithmetic intensity of the convolution and its
                     scalability under parallelism) as well as dynamic
                     properties of its execution (i.e., sparsity of the
                     computation). Given this characterization, we present
                     an automatic framework called spg-CNN for optimizing
                     CNN training on CPUs. It comprises of a computation
                     scheduler for efficient parallel execution, and two
                     code generators: one that optimizes for sparsity, and
                     the other that optimizes for spatial reuse in
                     convolutions. We evaluate spg-CNN using convolutions
                     from a variety of real world benchmarks, and show that
                     spg-CNN can train CNNs faster than state-of-the-art
                     approaches by an order of magnitude.},
  bibdate         = {Mon Jun 5 18:01:58 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Sundararajah:2017:LTN,
  author          = {Kirshanthan Sundararajah and Laith Sakka and Milind
                     Kulkarni},
  title           = {Locality Transformations for Nested Recursive
                     Iteration Spaces},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = {2017},
  month           = mar,
  volume          = {45},
  number          = {1},
  pages           = {281--295},
  DOI             = {https://doi.org/10.1145/3093337.3037720},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  remark          = {ASPLOS'17 conference proceedings},
  abstract        = {There has been a significant amount of effort invested
                     in designing scheduling transformations such as loop
                     tiling and loop fusion that rearrange the execution of
                     dynamic instances of loop nests to place operations
                     that access the same data close together temporally. In
                     recent years, there has been interest in designing
                     similar transformations that operate on recursive
                     programs, but until now these transformations have only
                     considered simple scenarios: multiple recursions to be
                     fused, or a recursion nested inside a simple loop. This
                     paper develops the first set of scheduling
                     transformations for nested recursions: recursive
                     methods that call other recursive methods. These are
                     the recursive analog to nested loops. We present a
                     transformation called recursion twisting that
                     automatically improves locality at all levels of the
                     memory hierarchy, and show that this transformation can
                     yield substantial performance improvements across
                     several benchmarks that exhibit nested recursion.},
  bibdate         = {Mon Jun 5 18:01:58 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Li:2017:LAC,
  author          = {Ang Li and Shuaiwen Leon Song and Weifeng Liu and Xu
                     Liu and Akash Kumar and Henk Corporaal},
  title           = {Locality-Aware {CTA} Clustering for Modern {GPUs}},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = {2017},
  month           = mar,
  volume          = {45},
  number          = {1},
  pages           = {297--311},
  DOI             = {https://doi.org/10.1145/3093337.3037709},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  remark          = {ASPLOS'17 conference proceedings},
  abstract        = {Cache is designed to exploit locality; however, the
                     role of on-chip L1 data caches on modern GPUs is often
                     awkward. The locality among global memory requests from
                     different SMs (Streaming Multiprocessors) is
                     predominantly harvested by the commonly-shared L2 with
                     long access latency; while the in-core locality, which
                     is crucial for performance delivery, is handled
                     explicitly by user-controlled scratchpad memory. In
                     this work, we disclose another type of data locality
                     that has been long ignored but with performance
                     boosting potential --- the inter-CTA locality.
                     Exploiting such locality is rather challenging due to
                     unclear hardware feasibility, unknown and inaccessible
                     underlying CTA scheduler, and small in-core cache
                     capacity. To address these issues, we first conduct a
                     thorough empirical exploration on various modern GPUs
                     and demonstrate that inter-CTA locality can be
                     harvested, both spatially and temporally, on L1 or
                     L1/Tex unified cache. Through further quantification
                     process, we prove the significance and commonality of
                     such locality among GPU applications, and discuss
                     whether such reuse is exploitable. By leveraging these
                     insights, we propose the concept of CTA-Clustering and
                     its associated software-based techniques to reshape the
                     default CTA scheduling in order to group the CTAs with
                     potential reuse together on the same SM. Our techniques
                     require no hardware modification and can be directly
                     deployed on existing GPUs. In addition, we incorporate
                     these techniques into an integrated framework for
                     automatic inter-CTA locality optimization. We evaluate
                     our techniques using a wide range of popular GPU
                     applications on all modern generations of NVIDIA GPU
                     architectures. The results show that our proposed
                     techniques significantly improve cache performance
                     through reducing L2 cache transactions by 55\%, 65\%,
                     29\%, 28\% on average for Fermi, Kepler, Maxwell and
                     Pascal, respectively, leading to an average of 1.46x,
                     1.48x, 1.45x, 1.41x (up to 3.8x, 3.6x, 3.1x, 3.3x)
                     performance speedups for applications with
                     algorithm-related inter-CTA reuse.},
  bibdate         = {Mon Jun 5 18:01:58 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Churchill:2017:SLS,
  author          = {Berkeley Churchill and Rahul Sharma and JF Bastien and
                     Alex Aiken},
  title           = {Sound Loop Superoptimization for {Google Native
                     Client}},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = {2017},
  month           = mar,
  volume          = {45},
  number          = {1},
  pages           = {313--326},
  DOI             = {https://doi.org/10.1145/3093337.3037754},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  remark          = {ASPLOS'17 conference proceedings},
  abstract        = {Software fault isolation (SFI) is an important
                     technique for the construction of secure operating
                     systems, web browsers, and other extensible software.
                     We demonstrate that superoptimization can dramatically
                     improve the performance of Google Native Client, a SFI
                     system that ships inside the Google Chrome Browser. Key
                     to our results are new techniques for superoptimization
                     of loops: we propose a new architecture for
                     superoptimization tools that incorporates both a fully
                     sound verification technique to ensure correctness and
                     a bounded verification technique to guide the search to
                     optimized code. In our evaluation we optimize 13 libc
                     string functions, formally verify the correctness of
                     the optimizations and report a median and average
                     speedup of 25\% over the libraries shipped by Google.},
  bibdate         = {Mon Jun 5 18:01:58 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@Article{Bianchini:2017:IDE,
  author          = {Ricardo Bianchini},
  title           = {Improving Datacenter Efficiency},
  journal         = j-COMP-ARCH-NEWS,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  year            = {2017},
  month           = mar,
  volume          = {45},
  number          = {1},
  pages           = {327--327},
  DOI             = {https://doi.org/10.1145/3093337.3046426},
  CODEN           = {CANED2},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
  remark          = {ASPLOS'17 conference proceedings},
  abstract        = {Internet companies can improve datacenter efficiency
                     and reduce costs, by minimizing resource waste while
                     avoiding (or limiting) performance degradation. In this
                     talk, I will first overview a few of the
                     efficiency-related efforts we are undertaking at
                     Microsoft, including leveraging workload history to
                     improve resource management. I will then discuss some
                     lessons from deploying these efforts in production and
                     how they relate to academic research.},
  bibdate         = {Mon Jun 5 18:01:58 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

%%% DudeTM (ASPLOS'17).  In the abstract, speedup factors are coded in
%%% TeX math as $ n \times $ and the percentage range uses $ \sim $: a
%%% bare tilde is a TeX tie and would typeset as a plain space.
@Article{Liu:2017:DBD,
  author =       "Mengxing Liu and Mingxing Zhang and Kang Chen and
                 Xuehai Qian and Yongwei Wu and Weimin Zheng and Jinglei
                 Ren",
  title =        "{DudeTM}: Building Durable Transactions with
                 Decoupling for Persistent Memory",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "329--343",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037714",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Emerging non-volatile memory (NVM) offers
                 non-volatility, byte-addressability and fast access at
                 the same time. To make the best use of these
                 properties, it has been shown by empirical evidence
                 that programs should access NVM directly through CPU
                 load and store instructions, so that the overhead of a
                 traditional file system or database can be avoided.
                 Thus, durable transactions become a common choice of
                 applications for accessing persistent memory data in a
                 crash consistent manner. However, existing durable
                 transaction systems employ either undo logging, which
                 requires a fence for every memory write, or redo
                 logging, which requires intercepting all memory reads
                 within transactions. This paper presents DUDETM, a
                 crash-consistent durable transaction system that avoids
                 the drawbacks of both undo logging and redo logging.
                 DUDETM uses shadow DRAM to decouple the execution of a
                 durable transaction into three fully asynchronous
                 steps. The advantage is that only minimal fences and no
                 memory read instrumentation are required. This design
                 also enables an out-of-the-box transactional memory
                 (TM) to be used as an independent component in our
                 system. The evaluation results show that DUDETM adds
                 durability to a TM system with only 7.4 $ \sim $
                 24.6\% throughput degradation. Compared to the
                 existing durable transaction systems, DUDETM provides
                 $ 1.7 \times $ to $ 4.4 \times $ higher throughput.
                 Moreover, DUDETM can be implemented with existing
                 hardware TMs with minor hardware modifications,
                 leading to a further $ 1.7 \times $ speedup.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'17 conference proceedings",
}

%%% ReFlex (ASPLOS'17).  The math in the title is brace-protected so
%%% that style-driven case changes cannot touch the control sequence,
%%% and the latency figure in the abstract is coded as 21 $ \mu $s
%%% (microseconds) instead of the ASCII fallback "us".
@Article{Klimovic:2017:RRF,
  author =       "Ana Klimovic and Heiner Litz and Christos Kozyrakis",
  title =        "{ReFlex}: Remote Flash {$ \approx $} Local Flash",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "345--359",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037732",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Remote access to NVMe Flash enables flexible scaling
                 and high utilization of Flash capacity and IOPS within
                 a datacenter. However, existing systems for remote
                 Flash access either introduce significant performance
                 overheads or fail to isolate the multiple remote
                 clients sharing each Flash device. We present ReFlex, a
                 software-based system for remote Flash access, that
                 provides nearly identical performance to accessing
                 local Flash. ReFlex uses a dataplane kernel to closely
                 integrate networking and storage processing to achieve
                 low latency and high throughput at low resource
                 requirements. Specifically, ReFlex can serve up to 850K
                 IOPS per core over TCP/IP networking, while adding
                 21 $ \mu $s over direct access to local Flash. ReFlex
                 uses a QoS scheduler that can enforce tail latency and
                 throughput service-level objectives (SLOs) for
                 thousands of remote clients. We show that ReFlex allows
                 applications to use remote Flash while maintaining
                 their original performance with local Flash.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'17 conference proceedings",
}

@Article{Jevdjic:2017:ASC,
  author =       "Djordje Jevdjic and Karin Strauss and Luis Ceze and
                 Henrique S. Malvar",
  title =        "Approximate Storage of Compressed and Encrypted
                 Videos",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "361--373",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037718",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The popularization of video capture devices has
                 created strong storage demand for encoded videos.
                 Approximate storage can ease this demand by enabling
                 denser storage at the expense of occasional errors.
                 Unfortunately, even minor storage errors, such as bit
                 flips, can result in major visual damage in encoded
                 videos. Similarly, video encryption, widely employed
                 for privacy and digital rights management, may create
                 long dependencies between bits that show little or no
                 tolerance to storage errors. In this paper we propose
                 VideoApp, a novel and efficient methodology to compute
                 bit-level reliability requirements for encoded videos
                 by tracking visual and metadata dependencies within
                 encoded bitstreams. We further show how VideoApp can be
                 used to trade video quality for storage density in an
                 optimal way. We integrate our methodology into a
                 popular H.264 encoder to partition an encoded video
                 stream into multiple streams that can receive different
                 levels of error correction according to their
                 reliability needs. When applied to a dense and highly
                 error-prone multi-level cell storage substrate, our
                 variable error correction mechanism reduces the error
                 correction overhead by half under the most
                 error-intolerant encoder settings, achieving
                 quality/density points that neither compression nor
                 approximation can achieve alone. Finally, we define the
                 basic invariants needed to support encrypted
                 approximate video storage. We present an analysis of
                 block cipher modes of operation, showing that some are
                 fully compatible with approximation, enabling
                 approximate and secure video storage systems.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'17 conference proceedings",
}

@Article{Elyasi:2017:EIR,
  author =       "Nima Elyasi and Mohammad Arjomand and Anand
                 Sivasubramaniam and Mahmut T. Kandemir and Chita R. Das
                 and Myoungsoo Jung",
  title =        "Exploiting Intra-Request Slack to Improve {SSD}
                 Performance",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "375--388",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037728",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "With Solid State Disks (SSDs) offering high degrees of
                 parallelism, SSD controllers place data and direct
                 requests to exploit the maximum offered hardware
                 parallelism. In the quest to maximize parallelism and
                 utilization, sub-requests of a request that are
                 directed to different flash chips by the scheduler can
                 experience differential wait times since their
                 individual queues are not coordinated and load balanced
                 at all times. Since the macro request is considered
                 complete only when its last sub-request completes, some
                 of its sub-requests that complete earlier have to
                 necessarily wait for this last sub-request. This paper
                 opens the door to a new class of schedulers to leverage
                 such slack between sub-requests in order to improve
                 response times. Specifically, the paper presents the
                 design and implementation of a slack-enabled
                 re-ordering scheduler, called Slacker, for sub-requests
                 issued to each flash chip. Layered under a modern SSD
                 request scheduler, Slacker estimates the slack of each
                 incoming sub-request to a flash chip and allows them to
                 jump ahead of existing sub-requests with sufficient
                 slack so as to not detrimentally impact their response
                 times. Slacker is simple to implement and imposes only
                 marginal additions to the hardware. Using a spectrum of
                 21 workloads with diverse read-write characteristics,
                 we show that Slacker provides as much as 19.5\%, 13\%
                 and 14.5\% improvement in response times, with average
                 improvements of 12\%, 6.5\% and 8.5\%, for
                 write-intensive, read-intensive and read-write balanced
                 workloads, respectively.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'17 conference proceedings",
}

@Article{Wang:2017:GSM,
  author =       "Kai Wang and Aftab Hussain and Zhiqiang Zuo and
                 Guoqing Xu and Ardalan Amiri Sani",
  title =        "{Graspan}: a Single-machine Disk-based Graph System
                 for Interprocedural Static Analyses of Large-scale
                 Systems Code",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "389--404",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037744",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "There is more than a decade-long history of using
                 static analysis to find bugs in systems such as Linux.
                 Most of the existing static analyses developed for
                 these systems are simple checkers that find bugs based
                 on pattern matching. Despite the presence of many
                 sophisticated interprocedural analyses, few of them
                 have been employed to improve checkers for systems code
                 due to their complex implementations and poor
                 scalability. In this paper, we revisit the scalability
                 problem of interprocedural static analysis from a ``Big
                 Data'' perspective. That is, we turn sophisticated code
                 analysis into Big Data analytics and leverage novel
                 data processing techniques to solve this traditional
                 programming language problem. We develop Graspan, a
                 disk-based parallel graph system that uses an edge-pair
                 centric computation model to compute dynamic transitive
                 closures on very large program graphs. We implement
                 context-sensitive pointer/alias and dataflow analyses
                 on Graspan. An evaluation of these analyses on large
                 codebases such as Linux shows that their Graspan
                 implementations scale to millions of lines of code and
                 are much simpler than their original implementations.
                 Moreover, we show that these analyses can be used to
                 augment the existing checkers; these augmented checkers
                 uncovered 132 new NULL pointer bugs and 1308
                 unnecessary NULL tests in Linux 4.4.0-rc5, PostgreSQL
                 8.3.9, and Apache httpd 2.2.18.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'17 conference proceedings",
}

@Article{Ren:2017:SDH,
  author =       "Ao Ren and Zhe Li and Caiwen Ding and Qinru Qiu and
                 Yanzhi Wang and Ji Li and Xuehai Qian and Bo Yuan",
  title =        "{SC-DCNN}: Highly-Scalable Deep Convolutional Neural
                 Network using Stochastic Computing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "405--418",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037746",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "With the recent advance of wearable devices and
                 Internet of Things (IoTs), it becomes attractive to
                 implement the Deep Convolutional Neural Networks
                 (DCNNs) in embedded and portable systems. Currently,
                 executing the software-based DCNNs requires
                 high-performance servers, restricting the widespread
                 deployment on embedded and mobile IoT devices. To
                 overcome this obstacle, considerable research efforts
                 have been made to develop highly-parallel and
                 specialized DCNN accelerators using GPGPUs, FPGAs or
                 ASICs. Stochastic Computing (SC), which uses a
                 bit-stream to represent a number within $ [-1, 1] $ by
                 counting the number of ones in the bit-stream, has high
                 potential for implementing DCNNs with high scalability
                 and ultra-low hardware footprint. Since multiplications
                 and additions can be calculated using AND gates and
                 multiplexers in SC, significant reductions in power
                 (energy) and hardware footprint can be achieved
                 compared to the conventional binary arithmetic
                 implementations. The tremendous savings in power
                 (energy) and hardware resources allow immense design
                 space for enhancing scalability and robustness for
                 hardware DCNNs. This paper presents SC-DCNN, the first
                 comprehensive design and optimization framework of
                 SC-based DCNNs, using a bottom-up approach. We first
                 present the designs of function blocks that perform the
                 basic operations in DCNN, including inner product,
                 pooling, and activation function. Then we propose four
                 designs of feature extraction blocks, which are in
                 charge of extracting features from input feature maps,
                 by connecting different basic function blocks with
                 joint optimization. Moreover, the efficient weight
                 storage methods are proposed to reduce the area and
                 power (energy) consumption. Putting all together, with
                 feature extraction blocks carefully selected, SC-DCNN
                 is holistically optimized to minimize area and power
                 (energy) consumption while maintaining high network
                 accuracy. Experimental results demonstrate that the
                 LeNet5 implemented in SC-DCNN consumes only 17 mm$^2$
                 area and 1.53 W power, achieves throughput of 781250
                 images/s, area efficiency of 45946 images/s/mm$^2$,
                 and energy efficiency of 510734 images/J.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'17 conference proceedings",
}

@Article{Ajay:2017:GIL,
  author =       "Jerry Ajay and Chen Song and Aditya Singh Rathore and
                 Chi Zhou and Wenyao Xu",
  title =        "{$3$DGates}: an Instruction-Level Energy Analysis and
                 Optimization of {$3$D} Printers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "419--433",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037752",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "As the next-generation manufacturing driven force, 3D
                 printing technology is having a transformative effect
                 on various industrial domains and has been widely
                 applied in a broad spectrum of applications. It also
                 progresses towards other versatile fields with portable
                 battery-powered 3D printers working on a limited energy
                 budget. While reducing manufacturing energy is an
                 essential challenge in industrial sustainability and
                 national economics, this growing trend motivates us to
                 explore the energy consumption of the 3D printer for
                 the purpose of energy efficiency. To this end, we
                 perform an in-depth analysis of energy consumption in
                 commercial, off-the-shelf 3D printers from an
                 instruction-level perspective. We build an
                 instruction-level energy model and an energy profiler
                 to analyze the energy cost during the fabrication
                 process. From the insights obtained by the energy
                 profiler, we propose and implement a cross-layer energy
                 optimization solution, called 3DGates, which spans the
                 instruction-set, the compiler and the firmware. We
                 evaluate 3DGates over 338 benchmarks on a 3D printer
                 and achieve an overall energy reduction of 25\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'17 conference proceedings",
}

@Article{Cox:2017:EAT,
  author =       "Guilherme Cox and Abhishek Bhattacharjee",
  title =        "Efficient Address Translation for Architectures with
                 Multiple Page Sizes",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "435--448",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037704",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Processors and operating systems (OSes) support
                 multiple memory page sizes. Superpages increase
                 Translation Lookaside Buffer (TLB) hits, while small
                 pages provide fine-grained memory protection. Ideally,
                 TLBs should perform well for any distribution of page
                 sizes. In reality, set-associative TLBs --- used
                 frequently for their energy efficiency compared to
                 fully-associative TLBs --- cannot (easily) support
                 multiple page sizes concurrently. Instead, commercial
                 systems typically implement separate set-associative
                 TLBs for different page sizes. This means that when
                 superpages are allocated aggressively, TLB misses may,
                 counter intuitively, increase even if entries for small
                 pages remain unused (and vice-versa). We invent MIX
                 TLBs, energy-frugal set-associative structures that
                 concurrently support all page sizes by exploiting
                 superpage allocation patterns. MIX TLBs boost the
                 performance (often by 10--30\%) of big-memory
                 applications on native CPUs, virtualized CPUs, and
                 GPUs. MIX TLBs are simple and require no OS or program
                 changes.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'17 conference proceedings",
}

@Article{Lesokhin:2017:PFS,
  author =       "Ilya Lesokhin and Haggai Eran and Shachar Raindel and
                 Guy Shapiro and Sagi Grimberg and Liran Liss and Muli
                 Ben-Yehuda and Nadav Amit and Dan Tsafrir",
  title =        "Page Fault Support for Network Controllers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "449--466",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037710",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Direct network I/O allows network controllers (NICs)
                 to expose multiple instances of themselves, to be used
                 by untrusted software without a trusted intermediary.
                 Direct I/O thus frees researchers from legacy software,
                 fueling studies that innovate in multitenant setups.
                 Such studies, however, overwhelmingly ignore one
                 serious problem: direct memory accesses (DMAs) of NICs
                 disallow page faults, forcing systems to either pin
                 entire address spaces to physical memory and thereby
                 hinder memory utilization, or resort to APIs that
                 pin/unpin memory buffers before/after they are DMAed,
                 which complicates the programming model and hampers
                 performance. We solve this problem by designing and
                 implementing page fault support for InfiniBand and
                 Ethernet NICs. A main challenge we tackle---unique to
                 NICs---is handling receive DMAs that trigger page
                 faults, leaving the NIC without memory to store the
                 incoming data. We demonstrate that our solution
                 provides all the benefits associated with ``regular''
                 virtual memory, notably (1) a simpler programming model
                 that rids users from the need to pin, and (2) the
                 ability to employ all the canonical memory
                 optimizations, such as memory overcommitment and
                 demand-paging based on actual use. We show that, as a
                 result, benchmark performance improves by up to 1.9x.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'17 conference proceedings",
}

@Article{Hu:2017:TFC,
  author =       "Yang Hu and Mingcong Song and Tao Li",
  title =        "Towards {``Full Containerization''} in Containerized
                 Network Function Virtualization",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "467--481",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037713",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "With exploding traffic stuffing existing network
                 infrastructure, today's telecommunication and cloud
                 service providers resort to Network Function
                 Virtualization (NFV) for greater agility and economics.
                 Pioneer service provider such as AT{\&}T proposes to
                 adopt container in NFV to achieve shorter Virtualized
                 Network Function (VNF) provisioning time and better
                 runtime performance. However, we characterize typical
                 NFV workloads on the containers and find that the
                 performance is unsatisfactory. We observe that the
                 shared host OS network stack is the main bottleneck,
                 where the traffic flow processing involves a large
                 amount of intermediate memory buffers and results in
                 significant last level cache pollution. Existing OS
                 memory allocation policies fail to exploit the locality
                 and data sharing information among buffers. In this
                 paper, we propose NetContainer, a software framework
                 that achieves fine-grained hardware resource management
                 for containerized NFV platform. NetContainer employs a
                 cache access overheads guided page coloring scheme to
                 coordinately address the inter-flow cache access
                 overheads and intra-flow cache access overheads. It
                 maps the memory buffer pages that manifest low cache
                 access overheads (across a flow or among the flows) to
                 the same last level cache partition. NetContainer
                 exploits a footprint theory based method to estimate
                 the cache access overheads and a Min-Cost Max-Flow
                 model to guide the memory buffer mappings. We implement
                 the NetContainer in Linux kernel and extensively
                 evaluate it with real NFV workloads. Experimental
                 results show that NetContainer outperforms conventional
                 page coloring-based memory allocator by 48\% in terms
                 of successful call rate.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'17 conference proceedings",
}

@Article{Wu:2017:FEF,
  author =       "Bo Wu and Xu Liu and Xiaobo Zhou and Changjun Jiang",
  title =        "{FLEP}: Enabling Flexible and Efficient Preemption on
                 {GPUs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "483--496",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037742",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "GPUs are widely adopted in HPC and cloud computing
                 platforms to accelerate general-purpose workloads.
                 However, modern GPUs do not support flexible
                 preemption, leading to performance and priority
                 inversion problems in multi-tasking environments. In
                 this paper, we propose and develop FLEP, the first
                 software system that enables flexible kernel preemption
                 and kernel scheduling on commodity GPUs. The FLEP
                 compilation engine transforms the GPU program into
                 preemptable forms, which can be interrupted during
                 execution and yield all or part of the streaming
                 multi-processors (SMs) in the GPU. The FLEP runtime
                 engine intercepts all kernel invocations and determines
                 which kernels and how those kernels should be preempted
                 and scheduled. Experimental results on two-kernel
                 co-runs demonstrate up to 24.2X speedup for
                 high-priority kernels and up to 27X improvement on
                 normalized average turnaround time for kernels with the
                 same priority. FLEP reduces the preemption latency by
                 up to 41\% compared to yielding the whole GPU when the
                 waiting kernels only need several SMs. With all the
                 benefits, FLEP only introduces 2.5\% runtime overhead,
                 which is substantially lower than the kernel slicing
                 approach.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'17 conference proceedings",
}

@Article{Li:2017:SSA,
  author =       "Kaiwei Li and Jianfei Chen and Wenguang Chen and Jun
                 Zhu",
  title =        "{SaberLDA}: Sparsity-Aware Learning of Topic Models on
                 {GPUs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "497--509",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037740",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Latent Dirichlet Allocation (LDA) is a popular tool
                 for analyzing discrete count data such as text and
                 images. Applications require LDA to handle both large
                 datasets and a large number of topics. Though
                 distributed CPU systems have been used, GPU-based
                 systems have emerged as a promising alternative because
                 of the high computational power and memory bandwidth of
                 GPUs. However, existing GPU-based LDA systems cannot
                 support a large number of topics because they use
                 algorithms on dense data structures whose time and
                 space complexity is linear to the number of topics. In
                 this paper, we propose SaberLDA, a GPU-based LDA system
                 that implements a sparsity-aware algorithm to achieve
                 sublinear time complexity and scales well to learn a
                 large number of topics. To address the challenges
                 introduced by sparsity, we propose a novel data layout,
                 a new warp-based sampling kernel, and an efficient
                 sparse count matrix updating algorithm that improves
                 locality, makes efficient utilization of GPU warps, and
                 reduces memory consumption. Experiments show that
                 SaberLDA can learn from billions-token-scale data with
                 up to 10,000 topics, which is almost two orders of
                 magnitude larger than that of the previous GPU-based
                 systems. With a single GPU card, SaberLDA is able to
                 learn 10,000 topics from a dataset of billions of
                 tokens in a few hours, which is only achievable with
                 clusters with tens of machines before.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'17 conference proceedings",
}

@Article{Khazraee:2017:MNO,
  author =       "Moein Khazraee and Lu Zhang and Luis Vega and Michael
                 Bedford Taylor",
  title =        "{Moonwalk}: {NRE} Optimization in {ASIC} Clouds",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "511--526",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037749",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Cloud services are becoming increasingly globalized
                 and data-center workloads are expanding exponentially.
                 GPU and FPGA-based clouds have illustrated improvements
                 in power and performance by accelerating
                 compute-intensive workloads. ASIC-based clouds are a
                 promising way to optimize the Total Cost of Ownership
                 (TCO) of a given datacenter computation (e.g. YouTube
                 transcoding) by reducing both energy consumption and
                 marginal computation cost. The feasibility of an ASIC
                 Cloud for a particular application is directly gated by
                 the ability to manage the Non-Recurring Engineering
                 (NRE) costs of designing and fabricating the ASIC, so
                 that it is significantly lower (e.g. 2X) than the TCO
                 of the best available alternative. In this paper, we
                 show that technology node selection is a major tool for
                 managing ASIC Cloud NRE, and allows the designer to
                 trade off an accelerator's excess energy efficiency and
                 cost performance for lower total cost. We explore NRE
                 and cross-technology optimization of ASIC Clouds for
                 four different applications: Bitcoin mining,
                 YouTube-style video transcoding, Litecoin, and Deep
                 Learning. We address these challenges and show large
                 reductions in the NRE, potentially enabling ASIC Clouds
                 to address a wider variety of datacenter workloads. Our
                 results suggest that advanced nodes like 16nm will lead
                 to sub-optimal TCO for many workloads, and that use of
                 older nodes like 65nm can enable a greater diversity of
                 ASIC Clouds.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'17 conference proceedings",
}

@Article{Park:2017:DRM,
  author = {Park, Jason Jong Kyu and Park, Yongjun and Mahlke, Scott},
  title = {Dynamic Resource Management for Efficient Utilization of
    Multitasking {GPUs}},
  journal = j-COMP-ARCH-NEWS,
  volume = {45},
  number = {1},
  pages = {527--540},
  month = mar,
  year = {2017},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/3093337.3037707},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Mon Jun 5 18:01:58 MDT 2017},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {As graphics processing units (GPUs) are broadly
    adopted, running multiple applications on a GPU at the
    same time is beginning to attract wide attention.
    Recent proposals on multitasking GPUs have focused on
    either spatial multitasking, which partitions GPU
    resource at a streaming multiprocessor (SM)
    granularity, or simultaneous multikernel (SMK), which
    runs multiple kernels on the same SM. However,
    multitasking performance varies heavily depending on
    the resource partitions within each scheme, and the
    application mixes. In this paper, we propose GPU
    Maestro that performs dynamic resource management for
    efficient utilization of multitasking GPUs. GPU Maestro
    can discover the best performing GPU resource partition
    exploiting both spatial multitasking and SMK.
    Furthermore, dynamism within a kernel and interference
    between the kernels are automatically considered
    because GPU Maestro finds the best performing partition
    through direct measurements. Evaluations show that GPU
    Maestro can improve average system throughput by 20.2\%
    and 13.9\% over the baseline spatial multitasking and
    SMK, respectively.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS'17 conference proceedings},
}

@Article{Zhang:2017:ISC,
  author = {Zhang, Rui and Stanley, Natalie and Griggs, Christopher and
    Chi, Andrew and Sturton, Cynthia},
  title = {Identifying Security Critical Properties for the Dynamic
    Verification of a Processor},
  journal = j-COMP-ARCH-NEWS,
  volume = {45},
  number = {1},
  pages = {541--554},
  month = mar,
  year = {2017},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/3093337.3037734},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Mon Jun 5 18:01:58 MDT 2017},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {We present a methodology for identifying security
    critical properties for use in the dynamic verification
    of a processor. Such verification has been shown to be
    an effective way to prevent exploits of vulnerabilities
    in the processor, given a meaningful set of security
    properties. We use known processor errata to establish
    an initial set of security-critical invariants of the
    processor. We then use machine learning to infer an
    additional set of invariants that are not tied to any
    particular, known vulnerability, yet are critical to
    security. We build a tool chain implementing the
    approach and evaluate it for the open-source OR1200
    RISC processor. We find that our tool can identify 19
    (86.4\%) of the 22 manually crafted security-critical
    properties from prior work and generates 3 new security
    properties not covered in prior work.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS'17 conference proceedings},
}

@Article{Ferraiuolo:2017:VPH,
  author = {Ferraiuolo, Andrew and Xu, Rui and Zhang, Danfeng and
    Myers, Andrew C. and Suh, G. Edward},
  title = {Verification of a Practical Hardware Security Architecture
    Through Static Information Flow Analysis},
  journal = j-COMP-ARCH-NEWS,
  volume = {45},
  number = {1},
  pages = {555--568},
  month = mar,
  year = {2017},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/3093337.3037739},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Mon Jun 5 18:01:58 MDT 2017},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {Hardware-based mechanisms for software isolation are
    becoming increasingly popular, but implementing these
    mechanisms correctly has proved difficult, undermining
    the root of security. This work introduces an effective
    way to formally verify important properties of such
    hardware security mechanisms. In our approach, hardware
    is developed using a lightweight security-typed
    hardware description language (HDL) that performs
    static information flow analysis. We show the
    practicality of our approach by implementing and
    verifying a simplified but realistic multi-core
    prototype of the ARM TrustZone architecture. To make
    the security-typed HDL expressive enough to verify a
    realistic processor, we develop new type system
    features. Our experiments suggest that information flow
    analysis is efficient, and programmer effort is modest.
    We also show that information flow constraints are an
    effective way to detect hardware vulnerabilities,
    including several found in commercial processors.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS'17 conference proceedings},
}

@Article{Chisnall:2017:CJS,
  author = {Chisnall, David and Davis, Brooks and Gudka, Khilan and
    Brazdil, David and Joannou, Alexandre and Woodruff, Jonathan and
    Markettos, A. Theodore and Maste, J. Edward and Norton, Robert and
    Son, Stacey and Roe, Michael and Moore, Simon W. and
    Neumann, Peter G. and Laurie, Ben and Watson, Robert N. M.},
  title = {{CHERI JNI}: Sinking the {Java} Security Model into the
    {C}},
  journal = j-COMP-ARCH-NEWS,
  volume = {45},
  number = {1},
  pages = {569--583},
  month = mar,
  year = {2017},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/3093337.3037725},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Mon Jun 5 18:01:58 MDT 2017},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/java2010.bib;
    https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {Java provides security and robustness by building a
    high-level security model atop the foundation of memory
    protection. Unfortunately, any native code linked into
    a Java program --- including the million lines used to
    implement the standard library --- is able to bypass
    both the memory protection and the higher-level
    policies. We present a hardware-assisted implementation
    of the Java native code interface, which extends the
    guarantees required for Java's security model to native
    code. Our design supports safe direct access to buffers
    owned by the JVM, including hardware-enforced read-only
    access where appropriate. We also present Java language
    syntax to declaratively describe isolated compartments
    for native code. We show that it is possible to
    preserve the memory safety and isolation requirements
    of the Java security model in C code, allowing native
    code to run in the same process as Java code with the
    same impact on security as running equivalent Java
    code. Our approach has a negligible impact on
    performance, compared with the existing unsafe native
    code interface. We demonstrate a prototype
    implementation running on the CHERI microprocessor
    synthesized in FPGA.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS'17 conference proceedings},
}

@Article{Ge:2017:GGC,
  author =       "Xinyang Ge and Weidong Cui and Trent Jaeger",
  title =        "{GRIFFIN}: Guarding Control Flows Using
                 {Intel Processor Trace}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "585--598",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037716",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Researchers are actively exploring techniques to
                 enforce control-flow integrity (CFI), which restricts
                 program execution to a predefined set of targets for
                 each indirect control transfer to prevent code-reuse
                 attacks. While hardware-assisted CFI enforcement may
                 have the potential for advantages in performance and
                 flexibility over software instrumentation, current
                 hardware-assisted defenses are either incomplete (i.e.,
                 do not enforce all control transfers) or less efficient
                 in comparison. We find that the recent introduction of
                 hardware features to log complete control-flow traces,
                 such as Intel Processor Trace (PT), provides an
                 opportunity to explore how efficient and flexible a
                 hardware-assisted CFI enforcement system may become.
                 While Intel PT was designed to aid in offline debugging
                 and failure diagnosis, we explore its effectiveness for
                 online CFI enforcement over unmodified binaries by
                 designing a parallelized method for enforcing various
                 types of CFI policies. We have implemented a prototype
                 called GRIFFIN in the Linux 4.2 kernel that enables
                 complete CFI enforcement over a variety of software,
                 including the Firefox browser and its jitted code. Our
                 experiments show that GRIFFIN can enforce fine-grained
                 CFI policies with shadow stack as recommended by
                 researchers at a performance that is comparable to
                 software-only instrumentation techniques. In addition,
                 we find that alternative logging approaches yield
                 significant performance improvements for trace
                 processing, identifying opportunities for further
                 hardware assistance.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'17 conference proceedings",
}

@Article{Delimitrou:2017:BKW,
  author =       "Christina Delimitrou and Christos Kozyrakis",
  title =        "{Bolt}: {I} Know What You Did Last Summer \ldots{} In
                 The Cloud",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "599--613",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037703",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Cloud providers routinely schedule multiple
                 applications per physical host to increase efficiency.
                 The resulting interference on shared resources often
                 leads to performance degradation and, more importantly,
                 security vulnerabilities. Interference can leak
                 important information ranging from a service's
                 placement to confidential data, like private keys. We
                 present Bolt, a practical system that accurately
                 detects the type and characteristics of applications
                 sharing a cloud platform based on the interference an
                 adversary sees on shared resources. Bolt leverages
                 online data mining techniques that only require 2--5
                 seconds for detection. In a multi-user study on EC2,
                 Bolt correctly identifies the characteristics of 385
                 out of 436 diverse workloads. Extracting this
                 information enables a wide spectrum of
                 previously-impractical cloud attacks, including denial
                 of service attacks (DoS) that increase tail latency by
                 140x, as well as resource freeing (RFA) and
                 co-residency attacks. Finally, we show that while
                 advanced isolation mechanisms, such as cache
                 partitioning lower detection accuracy, they are
                 insufficient to eliminate these vulnerabilities
                 altogether. To do so, one must either disallow core
                 sharing, or only allow it between threads of the same
                 application, leading to significant inefficiencies and
                 performance penalties.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'17 conference proceedings",
}

@Article{Kang:2017:NCI,
  author = {Kang, Yiping and Hauswald, Johann and Gao, Cao and
    Rovinski, Austin and Mudge, Trevor and Mars, Jason and
    Tang, Lingjia},
  title = {{Neurosurgeon}: Collaborative Intelligence Between the Cloud
    and Mobile Edge},
  journal = j-COMP-ARCH-NEWS,
  volume = {45},
  number = {1},
  pages = {615--629},
  month = mar,
  year = {2017},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/3093337.3037698},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Mon Jun 5 18:01:58 MDT 2017},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {The computation for today's intelligent personal
    assistants such as Apple Siri, Google Now, and
    Microsoft Cortana, is performed in the cloud. This
    cloud-only approach requires significant amounts of
    data to be sent to the cloud over the wireless network
    and puts significant computational pressure on the
    datacenter. However, as the computational resources in
    mobile devices become more powerful and energy
    efficient, questions arise as to whether this
    cloud-only processing is desirable moving forward, and
    what are the implications of pushing some or all of
    this compute to the mobile devices on the edge. In this
    paper, we examine the status quo approach of cloud-only
    processing and investigate computation partitioning
    strategies that effectively leverage both the cycles in
    the cloud and on the mobile device to achieve low
    latency, low energy consumption, and high datacenter
    throughput for this class of intelligent applications.
    Our study uses 8 intelligent applications spanning
    computer vision, speech, and natural language domains,
    all employing state-of-the-art Deep Neural Networks
    (DNNs) as the core machine learning technique. We find
    that given the characteristics of DNN algorithms, a
    fine-grained, layer-level computation partitioning
    strategy based on the data and computation variations
    of each layer within a DNN has significant latency and
    energy advantages over the status quo approach. Using
    this insight, we design Neurosurgeon, a lightweight
    scheduler to automatically partition DNN computation
    between mobile devices and datacenters at the
    granularity of neural network layers. Neurosurgeon does
    not require per-application profiling. It adapts to
    various DNN architectures, hardware platforms, wireless
    networks, and server load levels, intelligently
    partitioning computation for best latency or best
    mobile energy. We evaluate Neurosurgeon on a
    state-of-the-art mobile development platform and show
    that it improves end-to-end latency by 3.1X on average
    and up to 40.7X, reduces mobile energy consumption by
    59.5\% on average and up to 94.7\%, and improves
    datacenter throughput by 1.5X on average and up to
    6.7X.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS'17 conference proceedings},
}

@Article{Agarwal:2017:TAT,
  author = {Agarwal, Neha and Wenisch, Thomas F.},
  title = {{Thermostat}: Application-transparent Page Management for
    Two-tiered Main Memory},
  journal = j-COMP-ARCH-NEWS,
  volume = {45},
  number = {1},
  pages = {631--644},
  month = mar,
  year = {2017},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/3093337.3037706},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Mon Jun 5 18:01:58 MDT 2017},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {The advent of new memory technologies that are denser
    and cheaper than commodity DRAM has renewed interest in
    two-tiered main memory schemes. Infrequently accessed
    application data can be stored in such memories to
    achieve significant memory cost savings. Past research
    on two-tiered main memory has assumed a 4KB page size.
    However, 2MB huge pages are performance critical in
    cloud applications with large memory footprints,
    especially in virtualized cloud environments, where
    nested paging drastically increases the cost of 4KB
    page management. We present Thermostat, an
    application-transparent huge-page-aware mechanism to
    place pages in a dual-technology hybrid memory system
    while achieving both the cost advantages of two-tiered
    memory and performance advantages of transparent huge
    pages. We present an online page classification
    mechanism that accurately classifies both 4KB and 2MB
    pages as hot or cold while incurring no observable
    performance overhead across several representative
    cloud applications. We implement Thermostat in Linux
    kernel version 4.5 and evaluate its effectiveness on
    representative cloud computing workloads running under
    KVM virtualization. We emulate slow memory with
    performance characteristics approximating near-future
    high-density memory technology and show that Thermostat
    migrates up to 50\% of application footprint to slow
    memory while limiting performance degradation to 3\%,
    thereby reducing memory cost up to 30\%.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS'17 conference proceedings},
}

@Article{Barbalace:2017:BBH,
  author = {Barbalace, Antonio and Lyerly, Robert and
    Jelesnianski, Christopher and Carno, Anthony and
    Chuang, Ho-Ren and Legout, Vincent and Ravindran, Binoy},
  title = {Breaking the Boundaries in Heterogeneous-{ISA} Datacenters},
  journal = j-COMP-ARCH-NEWS,
  volume = {45},
  number = {1},
  pages = {645--659},
  month = mar,
  year = {2017},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/3093337.3037738},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Mon Jun 5 18:01:58 MDT 2017},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {Energy efficiency is one of the most important design
    considerations in running modern datacenters.
    Datacenter operating systems rely on software
    techniques such as execution migration to achieve
    energy efficiency across pools of machines. Execution
    migration is possible in datacenters today because they
    consist mainly of homogeneous-ISA machines. However,
    recent market trends indicate that alternate ISAs such
    as ARM and PowerPC are pushing into the datacenter,
    meaning current execution migration techniques are no
    longer applicable. How can execution migration be
    applied in future heterogeneous-ISA datacenters? In
    this work we present a compiler, runtime, and an
    operating system extension for enabling execution
    migration between heterogeneous-ISA servers. We present
    a new multi-ISA binary architecture and
    heterogeneous-OS containers for facilitating efficient
    migration of natively-compiled applications. We build
    and evaluate a prototype of our design and demonstrate
    energy savings of up to 66\% for a workload running on
    an ARM and an x86 server interconnected by a high-speed
    network.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS'17 conference proceedings},
}

@Article{Lustig:2017:ASC,
  author =       "Daniel Lustig and Andrew Wright and Alexandros
                 Papakonstantinou and Olivier Giroux",
  title =        "Automated Synthesis of Comprehensive Memory Model
                 Litmus Test Suites",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "661--675",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037723",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The memory consistency model is a fundamental part of
                 any shared memory architecture or programming model.
                 Modern weak memory models are notoriously difficult to
                 define and to implement correctly. Most real-world
                 programming languages, compilers, and
                 (micro)architectures therefore rely heavily on
                 black-box testing methodologies. The success of such
                 techniques requires that the suite of litmus tests used
                 to perform the testing be comprehensive --- it should
                 ideally stress all obscure corner cases of the model
                 and of its implementation. Most litmus test suites
                 today are generated from some combination of manual
                 effort and randomization; however, the complex and
                 subtle nature of contemporary memory models means that
                 manual effort is both error-prone and subject to
                 incomplete coverage. This paper presents a methodology
                 for synthesizing comprehensive litmus test suites
                 directly from a memory model specification. By
                 construction, these suites contain all tests satisfying
                 a minimality criterion: that no synchronization
                 mechanism in the test can be weakened without causing
                 new behaviors to become observable. We formalize this
                 notion using the Alloy modeling language, and we apply
                 it to a number of existing and newly-proposed memory
                 models. Our results show not only that this synthesis
                 technique can automatically reproduce all
                 manually-generated tests from existing suites, but also
                 that it discovers new tests that are not as well
                 studied.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'17 conference proceedings",
}

@Article{Liu:2017:DAD,
  author = {Liu, Haopeng and Li, Guangpu and Lukman, Jeffrey F. and
    Li, Jiaxin and Lu, Shan and Gunawi, Haryadi S. and Tian, Chen},
  title = {{DCatch}: Automatically Detecting Distributed Concurrency
    Bugs in Cloud Systems},
  journal = j-COMP-ARCH-NEWS,
  volume = {45},
  number = {1},
  pages = {677--691},
  month = mar,
  year = {2017},
  CODEN = {CANED2},
  DOI = {https://doi.org/10.1145/3093337.3037735},
  ISSN = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L = {0163-5964},
  bibdate = {Mon Jun 5 18:01:58 MDT 2017},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {In big data and cloud computing era, reliability of
    distributed systems is extremely important.
    Unfortunately, distributed concurrency bugs, referred
    to as DCbugs, widely exist. They hide in the large
    state space of distributed cloud systems and manifest
    non-deterministically depending on the timing of
    distributed computation and communication. Effective
    techniques to detect DCbugs are desired. This paper
    presents a pilot solution, DCatch, in the world of
    DCbug detection. DCatch predicts DCbugs by analyzing
    correct execution of distributed systems. To build
    DCatch, we design a set of happens-before rules that
    model a wide variety of communication and concurrency
    mechanisms in real-world distributed cloud systems. We
    then build runtime tracing and trace analysis tools to
    effectively identify concurrent conflicting memory
    accesses in these systems. Finally, we design tools to
    help prune false positives and trigger DCbugs. We have
    evaluated DCatch on four representative open-source
    distributed cloud systems, Cassandra, Hadoop MapReduce,
    HBase, and ZooKeeper. By monitoring correct execution
    of seven workloads on these systems, DCatch reports 32
    DCbugs, with 20 of them being truly harmful.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-URL = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS'17 conference proceedings},
}

@Article{Mashtizadeh:2017:TPD,
  author =       "Ali Jos{\'e} Mashtizadeh and Tal Garfinkel and David
                 Terei and David Mazi{\`e}res and Mendel Rosenblum",
  title =        "Towards Practical Default-On Multi-Core Record\slash
                 Replay",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "693--708",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037751",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "We present Castor, a record/replay system for
                 multi-core applications that provides consistently low
                 and predictable overheads. With Castor, developers can
                 leave record and replay on by default, making it
                 practical to record and reproduce production bugs, or
                 employ fault tolerance to recover from hardware
                 failures. Castor is inspired by several observations:
                 First, an efficient mechanism for logging
                 non-deterministic events is critical for recording
                 demanding workloads with low overhead. Through careful
                 use of hardware we were able to increase log throughput
                 by 10x or more, e.g., we could record a server handling
                 10x more requests per second for the same record
                 overhead. Second, most applications can be recorded
                 without modifying source code by using the compiler to
                 instrument language level sources of non-determinism,
                 in conjunction with more familiar techniques like
                 shared library interposition. Third, while Castor
                 cannot deterministically replay all data races, this
                 limitation is generally unimportant in practice,
                 contrary to what prior work has assumed. Castor
                 currently supports applications written in C, C++, and
                 Go on FreeBSD. We have evaluated Castor on parallel and
                 server workloads, including a commercial implementation
                 of memcached in Go, which runs Castor in production.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'17 conference proceedings",
}

@article{Huang:2017:PSA,
  author = {Jian Huang and Michael Allen-Bond and Xuechen Zhang},
  title = {{Pallas}: Semantic-Aware Checking for Finding Deep Bugs in Fast
    Path},
  journal = j-COMP-ARCH-NEWS,
  volume = {45},
  number = {1},
  pages = {709--722},
  month = mar,
  year = {2017},
  coden = {CANED2},
  doi = {https://doi.org/10.1145/3093337.3037743},
  issn = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l = {0163-5964},
  bibdate = {Mon Jun 5 18:01:58 MDT 2017},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {Software optimization is constantly a serious concern for
    developing high-performance systems. To accelerate the workflow execution
    of a specific functionality, software developers usually define and
    implement a fast path to speed up the critical and commonly executed
    functions in the workflow. However, producing a bug-free fast path is
    nontrivial. Our study on the Linux kernel discloses that a committed fast
    path can have up to 19 follow-up patches for bug fixing, and most of them
    are deep semantic bugs, which are difficult to be pinpointed by existing
    bug-finding tools. In this paper, we present such a new category of
    software bugs based on our fast-path bug study across various system
    software including virtual memory manager, file systems, network, and
    device drivers. We investigate their root causes and identify five
    error-prone aspects in a fast path: path state, trigger condition, path
    output, fault handling, and assistant data structure. We find that many
    of the deep bugs can be prevented by applying static analysis
    incorporating simple semantic information. We extract a set of rules
    based on our findings and build a toolkit PALLAS to check fast-path bugs.
    The evaluation results show that PALLAS can effectively reveal fast-path
    bugs in a variety of systems including Linux kernel, mobile operating
    system, software-defined networking system, and web browser.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-url = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS'17 conference proceedings},
}

@Article{Kotra:2017:HSC,
  author =       "Jagadish B. Kotra and Narges Shahidi and Zeshan A.
                 Chishti and Mahmut T. Kandemir",
  title =        "Hardware-Software Co-design to Mitigate {DRAM} Refresh
                 Overheads: a Case for Refresh-Aware Process
                 Scheduling",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "723--736",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037724",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "DRAM cells need periodic refresh to maintain data
                 integrity. With high capacity DRAMs, DRAM refresh poses
                 a significant performance bottleneck as the number of
                 rows to be refreshed (and hence the refresh cycle time,
                 tRFC) with each refresh command increases. Modern day
                 DRAMs perform refresh at a rank-level, while LPDDRs
                 used in mobile environments support refresh at a
                 per-bank level. Rank-level refresh degrades the
                 performance significantly since none of the banks in a
                 rank can serve the on-demand requests. Per-bank refresh
                 alleviates some of the performance bottlenecks as the
                 other banks in a rank are available for on-demand
                 requests. Typical DRAM retention time is in the order
                 of several milliseconds, viz, 64msec for environments
                 operating in temperatures below 85 deg C and 32msec for
                 environments operating above 85 deg C. With systems
                 moving towards increased consolidation (ex: virtualized
                 environments), DRAM refresh becomes a significant
                 bottleneck as it reduces the available overall DRAM
                 bandwidth per task. In this work, we propose a
                 hardware-software co-design to mitigate DRAM refresh
                 overheads by exposing the hardware address mapping and
                 DRAM refresh schedule to the Operating System. We
                 propose a novel DRAM refresh-aware process scheduling
                 algorithm in OS which schedules applications on cores
                 such that none of the on-demand requests from the
                 application are stalled by refreshes. Extensive
                 evaluation of our proposed co-design on
                 multi-programmed SPEC CPU2006 workloads show
                 significant performance improvement compared to the
                 previously proposed hardware only approaches.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'17 conference proceedings",
}

@article{Kim:2017:KPC,
  author = {Jinchun Kim and Elvira Teran and Paul V. Gratz and Daniel A.
    Jim{\'e}nez and Seth H. Pugsley and Chris Wilkerson},
  title = {Kill the Program Counter: Reconstructing Program Behavior in the
    Processor Cache Hierarchy},
  journal = j-COMP-ARCH-NEWS,
  volume = {45},
  number = {1},
  pages = {737--749},
  month = mar,
  year = {2017},
  coden = {CANED2},
  doi = {https://doi.org/10.1145/3093337.3037701},
  issn = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l = {0163-5964},
  bibdate = {Mon Jun 5 18:01:58 MDT 2017},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {Data prefetching and cache replacement algorithms have been
    intensively studied in the design of high performance microprocessors.
    Typically, the data prefetcher operates in the private caches and does
    not interact with the replacement policy in the shared Last-Level Cache
    (LLC). Similarly, most replacement policies do not consider demand and
    prefetch requests as different types of requests. In particular, program
    counter (PC)-based replacement policies cannot learn from prefetch
    requests since the data prefetcher does not generate a PC value. PC-based
    policies can also be negatively affected by compiler optimizations. In
    this paper, we propose a holistic cache management technique called
    Kill-the-PC (KPC) that overcomes the weaknesses of traditional
    prefetching and replacement policy algorithms. KPC cache management has
    three novel contributions. First, a prefetcher which approximates the
    future use distance of prefetch requests based on its prediction
    confidence. Second, a simple replacement policy provides similar or
    better performance than current state-of-the-art PC-based prediction
    using global hysteresis. Third, KPC integrates prefetching and
    replacement policy into a whole system which is greater than the sum of
    its parts. Information from the prefetcher is used to improve the
    performance of the replacement policy and vice-versa. Finally, KPC
    removes the need to propagate the PC through entire on-chip cache
    hierarchy while providing a holistic cache management approach with
    better performance than state-of-the-art PC-, and non-PC-based schemes.
    Our evaluation shows that KPC provides 8\% better performance than the
    best combination of existing prefetcher and replacement policy for
    multi-core workloads.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-url = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS'17 conference proceedings},
}

@article{Gao:2017:TSE,
  author = {Mingyu Gao and Jing Pu and Xuan Yang and Mark Horowitz and
    Christos Kozyrakis},
  title = {{TETRIS}: Scalable and Efficient Neural Network Acceleration with
    {$3$D} Memory},
  journal = j-COMP-ARCH-NEWS,
  volume = {45},
  number = {1},
  pages = {751--764},
  month = mar,
  year = {2017},
  coden = {CANED2},
  doi = {https://doi.org/10.1145/3093337.3037702},
  issn = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l = {0163-5964},
  bibdate = {Mon Jun 5 18:01:58 MDT 2017},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {The high accuracy of deep neural networks (NNs) has led to the
    development of NN accelerators that improve performance by two orders of
    magnitude. However, scaling these accelerators for higher performance
    with increasingly larger NNs exacerbates the cost and energy overheads of
    their memory systems, including the on-chip SRAM buffers and the off-chip
    DRAM channels. This paper presents the hardware architecture and software
    scheduling and partitioning techniques for TETRIS, a scalable NN
    accelerator using 3D memory. First, we show that the high throughput and
    low energy characteristics of 3D memory allow us to rebalance the NN
    accelerator design, using more area for processing elements and less area
    for SRAM buffers. Second, we move portions of the NN computations close
    to the DRAM banks to decrease bandwidth pressure and increase performance
    and energy efficiency. Third, we show that despite the use of small SRAM
    buffers, the presence of 3D memory simplifies dataflow scheduling for NN
    computations. We present an analytical scheduling scheme that matches the
    efficiency of schedules derived through exhaustive search. Finally, we
    develop a hybrid partitioning scheme that parallelizes the NN
    computations over multiple accelerators. Overall, we show that TETRIS
    improves the performance by 4.1x and reduces the energy by 1.5x over NN
    accelerators with conventional, low-power DRAM memory systems.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-url = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS'17 conference proceedings},
}

@Article{Song:2017:HBA,
  author =       "Wonjun Song and Gwangsun Kim and Hyungjoon Jung and
                 Jongwook Chung and Jung Ho Ahn and Jae W. Lee and John
                 Kim",
  title =        "History-Based Arbitration for Fairness in
                 Processor-Interconnect of {NUMA} Servers",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "765--777",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037753",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "NUMA (non-uniform memory access) servers are commonly
                 used in high-performance computing and datacenters.
                 Within each server, a processor-interconnect (e.g.,
                 Intel QPI, AMD HyperTransport) is used to communicate
                 between the different sockets or nodes. In this work,
                 we explore the impact of the processor-interconnect on
                 overall performance --- in particular, the performance
                 unfairness caused by processor-interconnect
                 arbitration. It is well known that locally-fair
                 arbitration does not guarantee globally-fair bandwidth
                 sharing as closer nodes receive more bandwidth in a
                 multi-hop network. However, this work demonstrates that
                 the opposite can occur in a commodity NUMA server where
                 remote nodes receive higher bandwidth (and perform
                 better). We analyze this problem and identify that this
                 occurs because of external concentration used in router
                 micro-architectures for processor-interconnects without
                 globally-aware arbitration. While accessing remote
                 memory can occur in any NUMA system, performance
                 unfairness (or performance variation) is more critical
                 in cloud computing and virtual machines with shared
                 resources. We demonstrate how this unfairness creates
                 significant performance variation when a workload is
                 executed on the Xen virtualization platform. We then
                 provide analysis using synthetic workloads to better
                 understand the source of unfairness and eliminate the
                 impact of other shared resources, including the shared
                 last-level cache and main memory. To provide fairness,
                 we propose a novel, history-based arbitration that
                 tracks the history of arbitration grants made in the
                 previous history window. A weighted arbitration is done
                 based on the history to provide global fairness.
                 Through simulations, we show our proposed history-based
                 arbitration can provide global fairness and minimize
                 the processor-interconnect performance unfairness at
                 low cost.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
  remark =       "ASPLOS'17 conference proceedings",
}

@article{Misra:2017:ELT,
  author = {Pulkit A. Misra and Jeffrey S. Chase and Johannes Gehrke and
    Alvin R. Lebeck},
  title = {Enabling Lightweight Transactions with Precision Time},
  journal = j-COMP-ARCH-NEWS,
  volume = {45},
  number = {1},
  pages = {779--794},
  month = mar,
  year = {2017},
  coden = {CANED2},
  doi = {https://doi.org/10.1145/3093337.3037722},
  issn = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l = {0163-5964},
  bibdate = {Mon Jun 5 18:01:58 MDT 2017},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {Distributed transactional storage is an important service in
    today's data centers. Achieving high performance without high complexity
    is often a challenge for these systems due to sophisticated consistency
    protocols and multiple layers of abstraction. In this paper we show how
    to combine two emerging technologies---Software-Defined Flash (SDF) and
    precise synchronized clocks---to improve performance and reduce
    complexity for transactional storage within the data center. We present a
    distributed transactional system (called MILANA) as a layer above a
    durable multi-version key-value store (called SEMEL) for read-heavy
    workloads within a data center. SEMEL exploits write behavior of SSDs to
    maintain a time-ordered sequence of versions for each key efficiently and
    durably. MILANA adds a variant of optimistic concurrency control above
    SEMEL's API to service read requests from a consistent snapshot and to
    enable clients to make fast local commit or abort decisions for read-only
    transactions. Experiments with the prototype reveal up to 43\% lower
    transaction abort rates using IEEE Precision Time Protocol (PTP) vs. the
    standard Network Time Protocol (NTP). Under the Retwis benchmark,
    client-local validation of read-only transactions yields a 35\% reduction
    in latency and 55\% increase in transaction throughput.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-url = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS'17 conference proceedings},
}

@article{Liu:2017:ITN,
  author = {Ming Liu and Liang Luo and Jacob Nelson and Luis Ceze and Arvind
    Krishnamurthy and Kishore Atreya},
  title = {{IncBricks}: Toward In-Network Computation with an In-Network
    Cache},
  journal = j-COMP-ARCH-NEWS,
  volume = {45},
  number = {1},
  pages = {795--809},
  month = mar,
  year = {2017},
  coden = {CANED2},
  doi = {https://doi.org/10.1145/3093337.3037731},
  issn = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l = {0163-5964},
  bibdate = {Mon Jun 5 18:01:58 MDT 2017},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {The emergence of programmable network devices and the
    increasing data traffic of datacenters motivate the idea of in-network
    computation. By offloading compute operations onto intermediate
    networking devices (e.g., switches, network accelerators, middleboxes),
    one can (1) serve network requests on the fly with low latency; (2)
    reduce datacenter traffic and mitigate network congestion; and (3) save
    energy by running servers in a low-power mode. However, since (1)
    existing switch technology doesn't provide general computing
    capabilities, and (2) commodity datacenter networks are complex (e.g.,
    hierarchical fat-tree topologies, multipath communication), enabling
    in-network computation inside a datacenter is challenging. In this paper,
    as a step towards in-network computing, we present IncBricks, an
    in-network caching fabric with basic computing primitives. IncBricks is a
    hardware-software co-designed system that supports caching in the network
    using a programmable network middlebox. As a key-value store accelerator,
    our prototype lowers request latency by over 30\% and doubles throughput
    for 1024 byte values in a common cluster configuration. Our results
    demonstrate the effectiveness of in-network computing and that efficient
    datacenter network request processing is possible if we carefully split
    the computation across the different programmable computing elements in a
    datacenter, including programmable switches, network accelerators, and
    end hosts.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-url = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS'17 conference proceedings},
}

@article{Akturk:2017:AAA,
  author = {Ismail Akturk and Ulya R. Karpuzcu},
  title = {{AMNESIAC}: Amnesic Automatic Computer},
  journal = j-COMP-ARCH-NEWS,
  volume = {45},
  number = {1},
  pages = {811--824},
  month = mar,
  year = {2017},
  coden = {CANED2},
  doi = {https://doi.org/10.1145/3093337.3037741},
  issn = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l = {0163-5964},
  bibdate = {Mon Jun 5 18:01:58 MDT 2017},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {Due to imbalances in technology scaling, the energy consumption
    of data storage and communication by far exceeds the energy consumption
    of actual data production, i.e., computation. As a consequence,
    recomputing data can become more energy efficient than storing and
    retrieving precomputed data. At the same time, recomputation can relax
    the pressure on the memory hierarchy and the communication bandwidth.
    This study hence assesses the energy efficiency prospects of trading
    computation for communication. We introduce an illustrative
    proof-of-concept design, identify practical limitations, and provide
    design guidelines.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-url = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS'17 conference proceedings},
}

@article{Bai:2017:VRE,
  author = {Yuxin Bai and Victor W. Lee and Engin Ipek},
  title = {Voltage Regulator Efficiency Aware Power Management},
  journal = j-COMP-ARCH-NEWS,
  volume = {45},
  number = {1},
  pages = {825--838},
  month = mar,
  year = {2017},
  coden = {CANED2},
  doi = {https://doi.org/10.1145/3093337.3037717},
  issn = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l = {0163-5964},
  bibdate = {Mon Jun 5 18:01:58 MDT 2017},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract = {Conventional off-chip voltage regulators are typically bulky
    and slow, and are inefficient at exploiting system and workload
    variability using Dynamic Voltage and Frequency Scaling (DVFS). On-die
    integration of voltage regulators has the potential to increase the
    energy efficiency of computer systems by enabling power control at a fine
    granularity in both space and time. The energy conversion efficiency of
    on-chip regulators, however, is typically much lower than off-chip
    regulators, which results in significant energy losses. Fine-grained
    power control and high voltage regulator efficiency are difficult to
    achieve simultaneously, with either emerging on-chip or conventional
    off-chip regulators. A voltage conversion framework that relies on a
    hierarchy of off-chip switching regulators and on-chip linear regulators
    is proposed to enable fine-grained power control with a regulator
    efficiency greater than 90\%. A DVFS control policy that is based on a
    reinforcement learning (RL) approach is developed to exploit the proposed
    framework. Per-core RL agents learn and improve their control policies
    independently, while retaining the ability to coordinate their actions to
    accomplish system level power management objectives. When evaluated on a
    mix of 14 parallel and 13 multiprogrammed workloads, the proposed voltage
    conversion framework achieves 18\% greater energy efficiency than a
    conventional framework that uses on-chip switching regulators. Moreover,
    when the RL based DVFS control policy is used to control the proposed
    voltage conversion framework, the system achieves a 21\% higher energy
    efficiency over a baseline oracle policy with coarse-grained power
    control capability.},
  acknowledgement = ack-nhfb,
  fjournal = {ACM SIGARCH Computer Architecture News},
  journal-url = {https://dl.acm.org/loi/sigarch},
  remark = {ASPLOS'17 conference proceedings},
}

@Article{Jouppi:2017:DPA,
  author =       "Norman P. Jouppi and Cliff Young and Nishant Patil and
                 David Patterson and Gaurav Agrawal and Raminder Bajwa
                 and Sarah Bates and Suresh Bhatia and Nan Boden and Al
                 Borchers and Rick Boyle and Cantin, Pierre-luc and
                 Clifford Chao and Chris Clark and Jeremy Coriell and
                 Mike Daley and Matt Dau and Jeffrey Dean and Ben Gelb
                 and Tara Vazir Ghaemmaghami and Rajendra Gottipati and
                 William Gulland and Robert Hagmann and C. Richard Ho
                 and Doug Hogberg and John Hu and Robert Hundt and Dan
                 Hurt and Julian Ibarz and Aaron Jaffey and Alek
                 Jaworski and Alexander Kaplan and Harshit Khaitan and
                 Daniel Killebrew and Andy Koch and Naveen Kumar and
                 Steve Lacy and James Laudon and James Law and Diemthu
                 Le and Chris Leary and Zhuyuan Liu and Kyle Lucke and
                 Alan Lundin and Gordon MacKean and Adriana Maggiore and
                 Maire Mahony and Kieran Miller and Rahul Nagarajan and
                 Ravi Narayanaswami and Ray Ni and Kathy Nix and Thomas
                 Norrie and Mark Omernick and Narayana Penukonda and
                 Andy Phelps and Jonathan Ross and Matt Ross and Amir
                 Salek and Emad Samadiani and Chris Severn and Gregory
                 Sizikov and Matthew Snelham and Jed Souter and Dan
                 Steinberg and Andy Swing and Mercedes Tan and Gregory
                 Thorson and Bo Tian and Horia Toma and Erick Tuttle and
                 Vijay Vasudevan and Richard Walter and Walter Wang and
                 Eric Wilcox and Doe Hyun Yoon",
  title =        "In-Datacenter Performance Analysis of a Tensor
                 Processing Unit",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "2",
  pages =        "1--12",
  month =        may,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3140659.3080246",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Sep 15 11:09:14 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Many architects believe that major improvements in
                 cost-energy-performance must now come from
                 domain-specific hardware. This paper evaluates a custom
                 ASIC---called a Tensor Processing Unit (TPU)---deployed
                 in datacenters since 2015 that accelerates the
                 inference phase of neural networks (NN). The heart of
                 the TPU is a 65,536 8-bit MAC matrix multiply unit that
                 offers a peak throughput of 92 TeraOps/second (TOPS)
                 and a large (28 MiB) software-managed on-chip memory.
                 The TPU's deterministic execution model is a better
                 match to the 99th-percentile response-time requirement
                 of our NN applications than are the time-varying
                 optimizations of CPUs and GPUs that help average
                 throughput more than guaranteed latency. The lack of
                 such features helps explain why, despite having myriad
                 MACs and a big memory, the TPU is relatively small and
                 low power. We compare the TPU to a server-class Intel
                 Haswell CPU and an Nvidia K80 GPU, which are
                 contemporaries deployed in the same datacenters. Our
                 workload, written in the high-level TensorFlow
                 framework, uses production NN applications (MLPs, CNNs,
                 and LSTMs) that represent 95\% of our datacenters' NN
                 inference demand. Despite low utilization for some
                 applications, the TPU is on average about 15X--30X
                 faster than its contemporary GPU or CPU, with TOPS/Watt
                 about 30X--80X higher. Moreover, using the GPU's
                 GDDR5 memory in the TPU would triple achieved TOPS and
                 raise TOPS/Watt to nearly 70X the GPU and 200X the
                 CPU.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Venkataramani:2017:SSC,
  author =       "Swagath Venkataramani and Ashish Ranjan and Subarno
                 Banerjee and Dipankar Das and Sasikanth Avancha and
                 Ashok Jagannathan and Ajaya Durg and Dheemanth Nagaraj
                 and Bharat Kaul and Pradeep Dubey and Anand
                 Raghunathan",
  title =        "{ScaleDeep}: a Scalable Compute Architecture for
                 Learning and Evaluating Deep Networks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "2",
  pages =        "13--26",
  month =        may,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3140659.3080244",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Sep 15 11:09:14 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Deep Neural Networks (DNNs) have demonstrated
                 state-of-the-art performance on a broad range of tasks
                 involving natural language, speech, image, and video
                 processing, and are deployed in many real world
                 applications. However, DNNs impose significant
                 computational challenges owing to the complexity of the
                 networks and the amount of data they process, both of
                 which are projected to grow in the future. To improve
                 the efficiency of DNNs, we propose ScaleDeep, a dense,
                 scalable server architecture, whose processing, memory
                 and interconnect subsystems are specialized to leverage
                 the compute and communication characteristics of DNNs.
                 While several DNN accelerator designs have been
                 proposed in recent years, the key difference is that
                 ScaleDeep primarily targets DNN training, as opposed to
                 only inference or evaluation. The key architectural
                 features from which ScaleDeep derives its efficiency
                 are: (i) heterogeneous processing tiles and chips to
                 match the wide diversity in computational
                 characteristics (FLOPs and Bytes/FLOP ratio) that
                 manifest at different levels of granularity in DNNs,
                 (ii) a memory hierarchy and 3-tiered interconnect
                 topology that is suited to the memory access and
                 communication patterns in DNNs, (iii) a low-overhead
                 synchronization mechanism based on hardware data-flow
                 trackers, and (iv) methods to map DNNs to the proposed
                 architecture that minimize data movement and improve
                 core utilization through nested pipelining. We have
                 developed a compiler to allow any DNN topology to be
                 programmed onto ScaleDeep, and a detailed architectural
                 simulator to estimate performance and energy. The
                 simulator incorporates timing and power models of
                 ScaleDeep's components based on synthesis to Intel's
                 14nm technology. We evaluate an embodiment of ScaleDeep
                 with 7032 processing tiles that operates at 600 MHz and
                 has a peak performance of 680 TFLOPs (single precision)
                 and 1.35 PFLOPs (half-precision) at 1.4KW. Across 11
                 state-of-the-art DNNs containing 0.65M--14.9M neurons
                 and 6.8M--145.9M weights, including winners from 5 years
                 of the ImageNet competition, ScaleDeep demonstrates
                 6x--28x speedup at iso-power over the state-of-the-art
                 performance on GPUs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Parashar et al.: the SCNN accelerator for compressed-sparse CNNs;
%%% volume 45, number 2, pages 27--40, May 2017.
@article{Parashar:2017:SAC,
  author          = {Angshuman Parashar and Minsoo Rhu and Anurag Mukkara
                     and Antonio Puglielli and Rangharajan Venkatesan and
                     Brucek Khailany and Joel Emer and Stephen W. Keckler
                     and William J. Dally},
  title           = {{SCNN}: an Accelerator for Compressed-sparse
                     Convolutional Neural Networks},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2017},
  month           = may,
  volume          = {45},
  number          = {2},
  pages           = {27--40},
  doi             = {https://doi.org/10.1145/3140659.3080254},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri Sep 15 11:09:14 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Convolutional Neural Networks (CNNs) have emerged as a
                     fundamental technology for machine learning. High
                     performance and extreme energy efficiency are critical
                     for deployments of CNNs, especially in mobile platforms
                     such as autonomous vehicles, cameras, and electronic
                     personal assistants. This paper introduces the Sparse
                     CNN (SCNN) accelerator architecture, which improves
                     performance and energy efficiency by exploiting the
                     zero-valued weights that stem from network pruning
                     during training and zero-valued activations that arise
                     from the common ReLU operator. Specifically, SCNN
                     employs a novel dataflow that enables maintaining the
                     sparse weights and activations in a compressed
                     encoding, which eliminates unnecessary data transfers
                     and reduces storage requirements. Furthermore, the SCNN
                     dataflow facilitates efficient delivery of those
                     weights and activations to a multiplier array, where
                     they are extensively reused; product accumulation is
                     performed in a novel accumulator array. On contemporary
                     neural networks, SCNN can improve both performance and
                     energy by a factor of 2.7x and 2.3x, respectively, over
                     a comparably provisioned dense CNN accelerator.},
}

%%% Cherupalli et al.: bespoke processors for ultra-low area and power;
%%% volume 45, number 2, pages 41--54, May 2017.
@article{Cherupalli:2017:BPA,
  author          = {Hari Cherupalli and Henry Duwe and Weidong Ye and
                     Rakesh Kumar and John Sartori},
  title           = {Bespoke Processors for Applications with Ultra-low
                     Area and Power Constraints},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2017},
  month           = may,
  volume          = {45},
  number          = {2},
  pages           = {41--54},
  doi             = {https://doi.org/10.1145/3140659.3080247},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri Sep 15 11:09:14 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  abstract        = {A large number of emerging applications such as
                     implantables, wearables, printed electronics, and IoT
                     have ultra-low area and power constraints. These
                     applications rely on ultra-low-power general purpose
                     microcontrollers and microprocessors, making them the
                     most abundant type of processor produced and used
                     today. While general purpose processors have several
                     advantages, such as amortized development cost across
                     many applications, they are significantly
                     over-provisioned for many area- and power-constrained
                     systems, which tend to run only one or a small number
                     of applications over their lifetime. In this paper, we
                     make a case for bespoke processor design, an automated
                     approach that tailors a general purpose processor IP to
                     a target application by removing all gates from the
                     design that can never be used by the application. Since
                     removed gates are never used by an application, bespoke
                     processors can achieve significantly lower area and
                     power than their general purpose counterparts without
                     any performance degradation. Also, gate removal can
                     expose additional timing slack that can be exploited to
                     increase area and power savings or performance of a
                     bespoke design. Bespoke processor design reduces area
                     and power by 62\% and 50\%, on average, while
                     exploiting exposed timing slack improves average power
                     savings to 65\%.},
}

%%% Chen et al.: programmable Galois Field processor for IoT; volume 45,
%%% number 2, pages 55--68, May 2017.
%%% NOTE: in the abstract the field size $ 2^m $ and the area unit
%%% mm$^2$ are written as TeX math; with the exponents flattened onto
%%% the baseline (``2m'', ``mm2'') both quantities would be misstated.
@Article{Chen:2017:PGF,
  author =       "Yajing Chen and Shengshuo Lu and Cheng Fu and David
                 Blaauw and Ronald {Dreslinski, Jr.} and Trevor Mudge
                 and Hun-Seok Kim",
  title =        "A Programmable {Galois} Field Processor for the
                 {Internet of Things}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "2",
  pages =        "55--68",
  month =        may,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3140659.3080227",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Sep 15 11:09:14 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper investigates the feasibility of a unified
                 processor architecture to enable error coding
                 flexibility and secure communication in low power
                 Internet of Things (IoT) wireless networks. Error
                 coding flexibility for wireless communication allows
                 IoT applications to exploit the large tradeoff space in
                 data rate, link distance and energy-efficiency. As a
                 solution, we present a light-weight Galois Field (GF)
                 processor to enable energy-efficient block coding and
                 symmetric/asymmetric cryptography kernel processing for
                 a wide range of GF sizes
                 ($ 2^m $, $ m = 2, 3, \ldots, 233 $) and
                 arbitrary irreducible polynomials. Program directed
                 connections among primitive GF arithmetic units enable
                 dynamically configured parallelism to efficiently
                 perform either four-way SIMD 5- to 8-bit GF operations,
                 including multiplicative inverse, or a wide bit-width
                 (e.g., 32-bit) GF product in a single cycle. To
                 illustrate our ideas, we synthesized our GF processor
                 in a 28nm technology. Compared to a baseline software
                 implementation optimized for a general purpose ARM M0+
                 processor, our processor exhibits a 5-20x speedup for
                 a range of error correction codes and
                 symmetric/asymmetric cryptography applications.
                 Additionally, our proposed GF processor consumes 431 $
                 \mu $W at 0.9V and 100MHz, and achieves 35.5pJ/b energy
                 efficiency while executing AES operations at 12.2Mbps.
                 We achieve this within an area of 0.01mm$^2$.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Wang, Chen, and Xu: XPro cross-end analytics architecture for
%%% wearables; volume 45, number 2, pages 69--80, May 2017.
@article{Wang:2017:XCE,
  author          = {Aosen Wang and Lizhong Chen and Wenyao Xu},
  title           = {{XPro}: a Cross-End Processing Architecture for Data
                     Analytics in Wearables},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2017},
  month           = may,
  volume          = {45},
  number          = {2},
  pages           = {69--80},
  doi             = {https://doi.org/10.1145/3140659.3080219},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri Sep 15 11:09:14 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Wearable computing systems have spurred many
                     opportunities to continuously monitor human bodies with
                     sensors worn on or implanted in the body. These
                     emerging platforms have started to revolutionize many
                     fields, including healthcare and wellness applications,
                     particularly when integrated with intelligent analytic
                     capabilities. However, a significant challenge that
                     computer architects are facing is how to embed
                     sophisticated analytic capabilities in wearable
                     computers in an energy-efficient way while not
                     compromising system performance. In this paper, we
                     present XPro, a novel cross-end analytic engine
                     architecture for wearable computing systems. The
                     proposed cross-end architecture is able to realize a
                     generic classification design across wearable sensors
                     and a data aggregator with high energy-efficiency. To
                     facilitate the practical use of XPro, we also develop
                     an Automatic XPro Generator that formally generates
                     XPro instances according to specific design
                     constraints. As a proof of concept, we study the design
                     and implementation of XPro with six different health
                     applications. Evaluation results show that, compared
                     with state-of-the-art methods, XPro can increase the
                     battery life of the sensor node by 1.6-2.4X while at
                     the same time reducing system delay by 15.6-60.8\% for
                     wearable computing systems.},
}

%%% Weisse, Bertacco, and Austin: HotCalls interface for SGX enclaves;
%%% volume 45, number 2, pages 81--93, May 2017.
%%% NOTE: the cycle-count range in the abstract is a numeric range and
%%% is therefore set with an unspaced en-dash (--); the spaced em-dash
%%% (---) later in the abstract is sentence punctuation and is kept.
@Article{Weisse:2017:RLC,
  author =       "Ofir Weisse and Valeria Bertacco and Todd Austin",
  title =        "Regaining Lost Cycles with {HotCalls}: a Fast
                 Interface for {SGX} Secure Enclaves",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "2",
  pages =        "81--93",
  month =        may,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3140659.3080208",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Sep 15 11:09:14 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Intel's SGX secure execution technology allows running
                 computations on secret data using untrusted servers.
                 While recent work showed how to port applications and
                 large-scale computations to run under SGX, the
                 performance implications of using the technology
                 remains an open question. We present the first
                 comprehensive quantitative study to evaluate the
                 performance of SGX. We show that straightforward use of
                 SGX library primitives for calling functions add
                 between 8,200--17,000 cycles overhead, compared to
                 150 cycles of a typical system call. We quantify the
                 performance impact of these library calls and show that
                 in applications with high system calls frequency, such
                 as memcached, openVPN, and lighttpd, which all have
                 high bandwidth network requirements, the performance
                 degradation may be as high as 79\%. We investigate the
                 sources of this performance degradation by leveraging a
                 new set of microbenchmarks for SGX-specific operations
                 such as enclave entry-calls and out-calls, and
                 encrypted memory I/O accesses. We leverage the insights
                 we gain from these analyses to design a new SGX
                 interface framework HotCalls. HotCalls are based on a
                 synchronization spin-lock mechanism and provide a
                 13-27x speedup over the default interface. It can
                 easily be integrated into existing code, making it a
                 practical solution. Compared to a baseline SGX
                 implementation of memcached, openVPN, and lighttpd ---
                 we show that using the new interface boosts the
                 throughput by 2.6-3.7x, and reduces application latency
                 by 62-74\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Aga and Narayanasamy: InvisiMem smart-memory defenses for the memory
%%% bus side channel; volume 45, number 2, pages 94--106, May 2017.
%%% This record lists two bibsource files (cryptography2010 and sigarch).
@article{Aga:2017:ISM,
  author          = {Shaizeen Aga and Satish Narayanasamy},
  title           = {{InvisiMem}: Smart Memory Defenses for Memory Bus Side
                     Channel},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2017},
  month           = may,
  volume          = {45},
  number          = {2},
  pages           = {94--106},
  doi             = {https://doi.org/10.1145/3140659.3080232},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri Sep 15 11:09:14 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                     https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  abstract        = {A practically feasible low-overhead hardware design
                     that provides strong defenses against memory bus side
                     channel remains elusive. This paper observes that smart
                     memory, memory with compute capability and a packetized
                     interface, can dramatically simplify this problem.
                     InvisiMem expands the trust base to include the logic
                     layer in the smart memory to implement cryptographic
                     primitives, which aid in addressing several memory bus
                     side channel vulnerabilities efficiently. This allows
                     the secure host processor to send encrypted addresses
                     over the untrusted memory bus, and thereby eliminates
                     the need for expensive address obfuscation techniques
                     based on Oblivious RAM (ORAM). In addition, smart
                     memory enables efficient solutions for ensuring
                     freshness without using expensive Merkle trees, and
                     mitigates memory bus timing channel using constant
                     heart-beat packets. We demonstrate that InvisiMem
                     designs have one to two orders of magnitude of lower
                     overheads for performance, space, energy, and memory
                     bandwidth, compared to prior solutions.},
}

%%% Awad et al.: ObfusMem low-overhead access obfuscation for trusted
%%% memories; volume 45, number 2, pages 107--119, May 2017.
@Article{Awad:2017:OLO,
  author =       "Amro Awad and Yipeng Wang and Deborah Shands and Yan
                 Solihin",
  title =        "{ObfusMem}: a Low-Overhead Access Obfuscation for
                 Trusted Memories",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "2",
  pages =        "107--119",
  month =        may,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3140659.3080230",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Sep 15 11:09:14 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Trustworthy software requires strong privacy and
                 security guarantees from a secure trust base in
                 hardware. While chipmakers provide hardware support for
                 basic security and privacy primitives such as enclaves
                 and memory encryption, these primitives do not address
                 hiding of the memory access pattern, information about
                 which may enable attacks on the system or reveal
                 characteristics of sensitive user data.
                 State-of-the-art approaches to protecting the access
                 pattern are largely based on Oblivious RAM (ORAM).
                 Unfortunately, current ORAM implementations suffer from
                 very significant practicality and overhead concerns,
                 including roughly an order of magnitude slowdown, more
                 than 100\% memory capacity overheads, and the potential
                 for system deadlock. Memory technology trends are
                 moving towards 3D and 2.5D integration, enabling
                 significant logic capabilities and sophisticated memory
                 interfaces. Leveraging the trends, we propose a new
                 approach to access pattern obfuscation, called
                 ObfusMem. ObfusMem adds the memory to the trusted
                 computing base and incorporates cryptographic engines
                 within the memory. ObfusMem encrypts commands and
                 addresses on the memory bus, hence the access pattern
                 is cryptographically obfuscated from external
                 observers. Our evaluation shows that ObfusMem incurs an
                 overhead of 10.9\% on average, which is about an order
                 of magnitude faster than ORAM implementations.
                 Furthermore, ObfusMem does not incur capacity overheads
                 and does not amplify writes. We analyze and compare the
                 security protections provided by ObfusMem and ORAM, and
                 highlight their differences.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Khatamifard et al.: ThermoGater thermally-aware on-chip voltage
%%% regulation; volume 45, number 2, pages 120--132, May 2017.
%%% Accented author names use brace-wrapped TeX accent commands.
@article{Khatamifard:2017:TTA,
  author          = {S. Karen Khatamifard and Longfei Wang and Weize Yu and
                     Sel{\c{c}}uk K{\"o}se and Ulya R. Karpuzcu},
  title           = {{ThermoGater}: Thermally-Aware On-Chip Voltage
                     Regulation},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2017},
  month           = may,
  volume          = {45},
  number          = {2},
  pages           = {120--132},
  doi             = {https://doi.org/10.1145/3140659.3080250},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri Sep 15 11:09:14 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Tailoring the operating voltage to fine-grain temporal
                     changes in the power and performance needs of the
                     workload can effectively enhance power efficiency.
                     Therefore, power-limited computing platforms of today
                     widely deploy integrated (i.e., on-chip) voltage
                     regulation which enables fast fine-grain voltage
                     control. Voltage regulators convert and distribute
                     power from an external energy source to the processor.
                     Unfortunately, power conversion loss is inevitable and
                     projected integrated regulator designs are unlikely to
                     eliminate this loss even asymptotically. Reconfigurable
                     power delivery by selective shut-down, i.e., gating, of
                     distributed on-chip regulators in response to
                     spatio-temporal changes in power demand can sustain
                     operation at the minimum conversion loss. However, even
                     the minimum conversion loss is sizable, and as
                     conversion loss gets dissipated as heat, on-chip
                     regulators can easily cause thermal emergencies due to
                     their small footprint. Although reconfigurable
                     distributed on-chip power delivery is emerging as a new
                     design paradigm to enforce sustained operation at
                     minimum possible power conversion loss, thermal
                     implications have been overlooked at the architectural
                     level. This paper hence provides a thermal
                     characterization. We introduce ThermoGater, an
                     architectural governor for a collection of practical,
                     thermally-aware regulator gating policies to mitigate
                     (if not prevent) regulator-induced thermal emergencies,
                     which also consider potential implications for voltage
                     noise. Practical ThermoGater policies can not only
                     sustain minimum power conversion loss throughout
                     execution effectively, but also keep the maximum
                     temperature (thermal gradient) across chip within
                     0.6${}^\circ $C (0.3${}^\circ $C) on average in
                     comparison to thermally-optimal oracular regulator
                     gating, while the maximum voltage noise stays within
                     1.0\% of the best case voltage noise profile.},
}

%%% Yang et al.: PowerChief power allocation for multi-stage
%%% applications; volume 45, number 2, pages 133--146, May 2017.
@article{Yang:2017:PIP,
  author          = {Hailong Yang and Quan Chen and Moeiz Riaz and Zhongzhi
                     Luan and Lingjia Tang and Jason Mars},
  title           = {{PowerChief}: Intelligent Power Allocation for
                     Multi-Stage Applications to Improve Responsiveness on
                     Power Constrained {CMP}},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2017},
  month           = may,
  volume          = {45},
  number          = {2},
  pages           = {133--146},
  doi             = {https://doi.org/10.1145/3140659.3080224},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri Sep 15 11:09:14 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Modern user facing applications consist of multiple
                     processing stages with a number of service instances in
                     each stage. The latency profile of these multi-stage
                     applications is intrinsically variable, making it
                     challenging to provide satisfactory responsiveness.
                     Given a limited power budget, improving the end-to-end
                     latency requires intelligently boosting the bottleneck
                     service across stages using multiple boosting
                     techniques. However, prior work fail to acknowledge the
                     multi-stage nature of user-facing applications and
                     perform poorly in improving responsiveness on power
                     constrained CMP, as they are unable to accurately
                     identify bottleneck service and apply the boosting
                     techniques adaptively. In this paper, we present
                     PowerChief, a runtime framework that (1) provides joint
                     design of service and query to monitor the latency
                     statistics across service stages and accurately
                     identifies the bottleneck service during runtime; (2)
                     adaptively chooses the boosting technique to accelerate
                     the bottleneck service with improved responsiveness;
                     (3) dynamically reallocates the constrained power
                     budget across service stages to accommodate the chosen
                     boosting technique. Evaluated with real world
                     multi-stage applications, PowerChief improves the
                     average latency by 20.3x and 32.4x (99\% tail latency
                     by 13.3x and 19.4x) for Sirius and Natural Language
                     Processing applications respectively compared to
                     stage-agnostic power allocation. In addition, for the
                     given QoS target, PowerChief reduces the power
                     consumption of Sirius and Web Search applications by
                     23\% and 33\% respectively over prior work.},
}

%%% Ravi and Lipasti: CHARSTAR clock-hierarchy-aware resource scaling;
%%% volume 45, number 2, pages 147--160, May 2017.
%%% The title is one brace group, so its capitalization (including
%%% ``ARchitectures'') is passed through unchanged by case-changing styles.
@article{Ravi:2017:CCH,
  author          = {Gokul Subramanian Ravi and Mikko H. Lipasti},
  title           = {{CHARSTAR: Clock Hierarchy Aware Resource Scaling in
                     Tiled ARchitectures}},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2017},
  month           = may,
  volume          = {45},
  number          = {2},
  pages           = {147--160},
  doi             = {https://doi.org/10.1145/3140659.3080212},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri Sep 15 11:09:14 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  abstract        = {High-performance architectures are over-provisioned
                     with resources to extract the maximum achievable
                     performance out of applications. Two sources of
                     avoidable power dissipation are the leakage power from
                     underutilized resources, along with clock power from
                     the clock hierarchy that feeds these resources. Most
                     reconfiguration mechanisms either focus solely on power
                     gating execution resources alone or in addition, simply
                     turn off the immediate clock tree segment which
                     supplied the clock to those resources. These proposals
                     neither attempt to gate further up the clock hierarchy
                     nor do they involve the clock hierarchy in influencing
                     the reconfiguration decisions. The primary contribution
                     of CHARSTAR is optimizing reconfiguration mechanisms to
                     become clock hierarchy aware. Resource gating decisions
                     are cognizant of the power consumed by each node in the
                     clock hierarchy and additionally, entire branches of
                     the clock tree are greedily shut down whenever
                     possible. The CHARSTAR design is further optimized for
                     balanced spatio-temporal reconfiguration and also
                     enables efficient joint control of resource and
                     frequency scaling. The proposal is implemented by
                     leveraging the inherent advantages of spatial
                     architectures, utilizing a control mechanism driven by
                     a lightweight offline trained neural predictor.
                     CHARSTAR, when deployed on the CRIB tiled
                     microarchitecture, improves processor energy efficiency
                     by 20-25\%, with efficiency improvements of roughly 2x
                     in comparison to a naive power gating mechanism.
                     Alternatively, it improves performance by 10-20\% under
                     varying power and energy constraints.},
}

%%% Sinclair, Alsop, and Adve: semantics and evaluation of relaxed
%%% atomics; volume 45, number 2, pages 161--174, May 2017.
@article{Sinclair:2017:CRS,
  author          = {Matthew D. Sinclair and Johnathan Alsop and Sarita V.
                     Adve},
  title           = {Chasing Away {RAts}: Semantics and Evaluation for
                     Relaxed Atomics on Heterogeneous Systems},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2017},
  month           = may,
  volume          = {45},
  number          = {2},
  pages           = {161--174},
  doi             = {https://doi.org/10.1145/3140659.3080206},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri Sep 15 11:09:14 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  abstract        = {An unambiguous and easy-to-understand memory
                     consistency model is crucial for ensuring correct
                     synchronization and guiding future design of
                     heterogeneous systems. In a widely adopted approach,
                     the memory model guarantees sequential consistency (SC)
                     as long as programmers obey certain rules. The popular
                     data-race-free-0 (DRF0) model exemplifies this
                     SC-centric approach by requiring programmers to avoid
                     data races. Recent industry models, however, have
                     extended such SC-centric models to incorporate relaxed
                     atomics. These extensions can improve performance, but
                     are difficult to specify formally and use correctly.
                     This work addresses the impact of relaxed atomics on
                     consistency models for heterogeneous systems in two
                     ways. First, we introduce a new model,
                     Data-Race-Free-Relaxed (DRFrlx), that extends DRF0 to
                     provide SC-centric semantics for the common use cases
                     of relaxed atomics. Second, we evaluate the performance
                     of relaxed atomics in CPU-GPU systems for these use
                     cases. We find mixed results --- for most cases,
                     relaxed atomics provide only a small benefit in
                     execution time, but for some cases, they help
                     significantly (e.g., up to 51\% for DRFrlx over
                     DRF0).},
}

%%% Shin, Tuck, and Solihin: speculating past persist barriers on
%%% non-volatile main memory; volume 45, number 2, pages 175--186,
%%% May 2017.
@article{Shin:2017:HLL,
  author          = {Seunghee Shin and James Tuck and Yan Solihin},
  title           = {Hiding the Long Latency of Persist Barriers Using
                     Speculative Execution},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2017},
  month           = may,
  volume          = {45},
  number          = {2},
  pages           = {175--186},
  doi             = {https://doi.org/10.1145/3140659.3080240},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri Sep 15 11:09:14 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Byte-addressable non-volatile memory technology is
                     emerging as an alternative for DRAM for main memory.
                     This new Non-Volatile Main Memory (NVMM) allows
                     programmers to store important data in data structures
                     in memory instead of serializing it to the file system,
                     thereby providing a substantial performance boost.
                     However, modern systems reorder memory operations and
                     utilize volatile caches for better performance, making
                     it difficult to ensure a consistent state in NVMM.
                     Intel recently announced a new set of persistence
                     instructions, clflushopt, clwb, and pcommit. These new
                     instructions make it possible to implement fail-safe
                     code on NVMM, but few workloads have been written or
                     characterized using these new instructions. In this
                     work, we describe how these instructions work and how
                     they can be used to implement write-ahead logging based
                     transactions. We implement several common data
                     structures and kernels and evaluate the performance
                     overhead incurred over traditional non-persistent
                     implementations. In particular, we find that
                     persistence instructions occur in clusters along with
                     expensive fence operations, they have long latency, and
                     they add a significant execution time overhead, on
                     average by 20.3\% over code with logging but without
                     fence instructions to order persists. To deal with this
                     overhead and alleviate the performance bottleneck, we
                     propose to speculate past long latency persistency
                     operations using checkpoint-based processing. Our
                     speculative persistence architecture reduces the
                     execution time overheads to only 3.6\%.},
}

@Article{Ros:2017:NSL,
  author =       "Alberto Ros and Trevor E. Carlson and Mehdi Alipour
                 and Stefanos Kaxiras",
  title =        "Non-Speculative Load-Load Reordering in {TSO}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "2",
  pages =        "187--200",
  month =        may,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3140659.3080220",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Sep 15 11:09:14 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In Total Store Order memory consistency (TSO), loads
                 can be speculatively reordered to improve performance.
                 If a load-load reordering is seen by other cores,
                 speculative loads must be squashed and re-executed. In
                 architectures with an unordered interconnection network
                 and directory coherence, this has been the established
                 view for decades. We show, for the first time, that it
                 is not necessary to squash and re-execute speculatively
                 reordered loads in TSO when their reordering is seen.
                 Instead, the reordering can be hidden from other cores
                 by the coherence protocol. The implication is that we
                 can irrevocably bind speculative loads. This allows us
                 to commit reordered loads out-of-order without having
                 to wait (for the loads to become non-speculative) or
                 without having to checkpoint committed state (and
                 rollback if needed), just to ensure correctness in the
                 rare case of some core seeing the reordering. We show
                 that by exposing a reordering to the coherence layer
                 and by appropriately modifying a typical directory
                 protocol we can successfully hide load-load reordering
                 without perceptible performance cost and without
                 deadlock. Our solution is cost-effective and increases
                 the performance of out-of-order commit by a sizable
                 margin, compared to the base case where memory
                 operations are not allowed to commit if the consistency
                 model could be violated.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Lee:2017:MVN,
  author =       "Doowon Lee and Valeria Bertacco",
  title =        "{MTraceCheck}: Validating Non-Deterministic Behavior
                 of Memory Consistency Models in Post-Silicon
                 Validation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "2",
  pages =        "201--213",
  month =        may,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3140659.3080235",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Sep 15 11:09:14 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This work presents a minimally-intrusive,
                 high-performance, post-silicon validation framework for
                 validating memory consistency in multi-core systems.
                 Our framework generates constrained-random tests that
                 are instrumented with observability-enhancing code for
                 memory consistency verification. For each test, we
                 generate a set of compact signatures reflecting the
                 memory-ordering patterns observed over many executions
                 of the test, with each of the signatures corresponding
                 to a unique memory-ordering pattern. We then leverage
                 an efficient and novel analysis to quickly determine if
                 the observed execution patterns represented by each
                 unique signature abide by the memory consistency model.
                 Our analysis derives its efficiency by exploiting the
                 structural similarities among the patterns observed. We
                 evaluated our framework, MTraceCheck, on two platforms:
                 an x86-based desktop and an ARM-based SoC platform,
                 both running multi-threaded test programs in a
                 bare-metal environment. We show that MTraceCheck
                 reduces the perturbation introduced by the
                 memory-ordering monitoring activity by 93\% on average,
                 compared to a baseline register flushing approach that
                 saves the register's state after each load operation.
                 We also reduce the computation requirements of our
                 consistency checking analysis by 81\% on average,
                 compared to a conventional topological sorting
                 solution. We finally demonstrate the effectiveness of
                 MTraceCheck on buggy designs, by evaluating multiple
                 case studies where it successfully exposes subtle bugs
                 in a full-system simulation environment.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Zheng:2017:RMA,
  author =       "Ruohuang Zheng and Michael C. Huang",
  title =        "Redundant Memory Array Architecture for Efficient
                 Selective Protection",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "2",
  pages =        "214--227",
  month =        may,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3140659.3080213",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Sep 15 11:09:14 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Memory hardware errors may result from transient
                 particle-induced faults as well as device defects due
                 to aging. These errors are an important threat to
                 computer system reliability as VLSI technologies
                 continue to scale. Managing memory hardware errors is a
                 critical component in developing an overall system
                 dependability strategy. Memory error detection and
                 correction are supported in a range of available
                 hardware mechanisms. However, memory protections
                 (particularly the more advanced ones) come at
                 substantial costs in performance and energy usage.
                 Moreover, the protection mechanisms are often a fixed,
                 system-wide choice and cannot easily adapt to
                 different protection demands of different applications
                 or memory regions. In this paper, we present a new RAIM
                 (redundant array of independent memory) design that
                 compared to the state-of-the-art implementation can
                 easily provide high protection capability and the
                 ability to selectively protect a subset of the memory.
                 A straightforward implementation of the design can
                 incur a substantial memory traffic overhead. We propose
                 a few practical optimizations to mitigate this
                 overhead. With these optimizations the proposed RAIM
                 design offers significant advantages over existing RAIM
                 design at lower or comparable costs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Hicks:2017:CAS,
  author =       "Matthew Hicks",
  title =        "{Clank}: Architectural Support for Intermittent
                 Computation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "2",
  pages =        "228--240",
  month =        may,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3140659.3080238",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Sep 15 11:09:14 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The processors that drive embedded systems are getting
                 smaller; meanwhile, the batteries used to provide power
                 to those systems have stagnated. If we are to realize
                 the dream of ubiquitous computing promised by the
                 Internet of Things, processors must shed large, heavy,
                 expensive, and high maintenance batteries and, instead,
                 harvest energy from their environment. One challenge
                 with this transition is that harvested energy is
                 insufficient for continuous operation. Unfortunately,
                 existing programs fail miserably when executed
                 intermittently. This paper presents Clank: lightweight
                 architectural support for correct and efficient
                 execution of long-running applications on harvested
                 energy---without programmer intervention or extreme
                 hardware modifications. Clank is a set of hardware
                 buffers and memory-access monitors that dynamically
                 maintain idempotency. Essentially, Clank dynamically
                 decomposes program execution into a stream of
                 restartable sub-executions connected via lightweight
                 checkpoints. To validate Clank's ability to correctly
                 stretch program execution across frequent, random power
                 cycles, and to explore the associated hardware and
                 software overheads, we implement Clank in Verilog,
                 formally verify it, and then add it to an ARM Cortex
                 M0+ processor which we use to run a set of 23 embedded
                 systems benchmarks. Experiments show run-time overheads
                 as low as 2.5\%, with run-time overheads of 6\% for a
                 version of Clank that adds 1.7\% hardware. Clank
                 minimizes checkpoints so much that re-execution time
                 becomes the dominant contributor to run-time
                 overhead.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kaliorakis:2017:MED,
  author =       "Manolis Kaliorakis and Dimitris Gizopoulos and Ramon
                 Canal and Antonio Gonzalez",
  title =        "{MeRLiN}: Exploiting Dynamic Instruction Behavior for
                 Fast and Accurate Microarchitecture Level Reliability
                 Assessment",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "2",
  pages =        "241--254",
  month =        may,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3140659.3080225",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Sep 15 11:09:14 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Early reliability assessment of hardware structures
                 using microarchitecture level simulators can
                 effectively guide major error protection decisions in
                 microprocessor design. Statistical fault injection on
                 microarchitectural structures modeled in performance
                 simulators is an accurate method to measure their
                 Architectural Vulnerability Factor (AVF) but requires
                 excessively long campaigns to obtain high statistical
                 significance. We propose MeRLiN, a methodology to
                 boost microarchitecture level injection-based
                 reliability assessment by several orders of magnitude
                 and keep the accuracy of the assessment unaffected even
                 for large injection campaigns with very high
                 statistical significance. The core of MeRLiN is the
                 grouping of faults of an initial list in equivalent
                 classes. All faults in the same group target equivalent
                 vulnerable intervals of program execution ending up to
                 the same static instruction that reads the faulty
                 entries. Faults in the same group occur in different
                 times and entries of a structure and it is extremely
                 likely that they all have the same effect in program
                 execution; thus, fault injection is performed only on a
                 few representatives from each group. We evaluate MeRLiN
                 for different sizes of the physical register file, the
                 store queue and the first level data cache of a
                 contemporary microarchitecture running MiBench and SPEC
                 CPU2006 benchmarks. For all our experiments, MeRLiN is
                 from 2 to 3 orders of magnitude faster than an
                 extremely high statistical significant injection
                 campaign, reporting the same reliability measurements
                 with negligible loss of accuracy. Finally, we
                 theoretically analyze MeRLiN's statistical behavior to
                 further justify its accuracy.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Patel:2017:RPR,
  author =       "Minesh Patel and Jeremie S. Kim and Onur Mutlu",
  title =        "The Reach Profiler {(REAPER)}: Enabling the Mitigation
                 of {DRAM} Retention Failures via Profiling at
                 Aggressive Conditions",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "2",
  pages =        "255--268",
  month =        may,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3140659.3080242",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Sep 15 11:09:14 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Modern DRAM-based systems suffer from significant
                 energy and latency penalties due to conservative DRAM
                 refresh standards. Volatile DRAM cells can retain
                 information across a wide distribution of times ranging
                 from milliseconds to many minutes, but each cell is
                 currently refreshed every 64 ms to account for the
                 extreme tail end of the retention time distribution,
                 leading to a high refresh overhead. Due to poor DRAM
                 technology scaling, this problem is expected to get
                 worse in future device generations. Hence, the current
                 approach of refreshing all cells with the worst-case
                 refresh rate must be replaced with a more intelligent
                 design. Many prior works propose reducing the refresh
                 overhead by extending the default refresh interval to a
                 higher value, which we refer to as the target refresh
                 interval, across parts or all of a DRAM chip. These
                 proposals handle the small set of failing cells that
                 cannot retain data throughout the entire extended
                 refresh interval via retention failure mitigation
                 mechanisms (e.g., error correcting codes or bit-repair
                 mechanisms). This set of failing cells is discovered
                 via retention failure profiling, which is currently a
                 brute-force process that writes a set of known data to
                 DRAM, disables refresh and waits for the duration of
                 the target refresh interval, and then checks for
                 retention failures across the DRAM chip. We show that
                 this brute-force approach is too slow and is
                 detrimental to system execution, especially with
                 frequent online profiling. This paper presents reach
                 profiling, a new methodology for retention failure
                 profiling based on the key observation that an
                 overwhelming majority of failing DRAM cells at a target
                 refresh interval fail more reliably at both longer
                 refresh intervals and higher temperatures. Using 368
                 state-of-the-art LPDDR4 DRAM chips from three major
                 vendors, we conduct a thorough experimental
                 characterization of the complex set of tradeoffs
                 inherent in the profiling process. We identify three
                 key metrics to guide design choices for retention
                 failure profiling and mitigation mechanisms: coverage,
                 false positive rate, and runtime. We propose reach
                 profiling, a new retention failure profiling mechanism
                 whose key idea is to profile failing cells at a longer
                 refresh interval and/or higher temperature relative to
                 the target conditions in order to maximize failure
                 coverage while minimizing the false positive rate and
                 profiling runtime. We thoroughly explore the tradeoffs
                 associated with reach profiling and show that there is
                 significant room for improvement in DRAM retention
                 failure profiling beyond the brute-force approach. We
                 show with experimental data that on average, by
                 profiling at 250 ms above the target refresh interval,
                 our first implementation of reach profiling (called
                 REAPER) can attain greater than 99\% coverage of
                 failing DRAM cells with less than a 50\% false positive
                 rate while running 2.5x faster than the brute-force
                 approach. In addition, our end-to-end evaluations show
                 that REAPER enables significant system performance
                 improvement and DRAM power reduction, outperforming the
                 brute-force approach and enabling high-performance
                 operation at longer refresh intervals that were
                 previously unreasonable to employ due to the high
                 associated profiling overhead.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wang:2017:QSS,
  author =       "Zhenning Wang and Jun Yang and Rami Melhem and Bruce
                 Childers and Youtao Zhang and Minyi Guo",
  title =        "Quality of Service Support for Fine-Grained Sharing on
                 {GPUs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "2",
  pages =        "269--281",
  month =        may,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3140659.3080203",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Sep 15 11:09:14 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "GPUs have been widely adopted in data centers to
                 provide acceleration services to many applications.
                 Sharing a GPU is increasingly important for better
                 processing throughput and energy efficiency. However,
                 quality of service (QoS) among concurrent applications
                 is minimally supported. Previous efforts are too
                 coarse-grained and not scalable with increasing QoS
                 requirements. We propose QoS mechanisms for a
                 fine-grained form of GPU sharing. Our QoS support can
                 provide control over the progress of kernels on a per
                 cycle basis and the amount of thread-level parallelism
                 of each kernel. Due to accurate resource management,
                 our QoS support has significantly better scalability
                 compared with previous best efforts. Evaluations show
                 that, when the GPU is shared by three kernels, two of
                 which have QoS goals, the proposed techniques achieve
                 QoS goals 43.8\% more often than previous techniques
                 and have 20.5\% higher throughput.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Chen:2017:AGH,
  author =       "Sui Chen and Lu Peng and Samuel Irving",
  title =        "Accelerating {GPU} Hardware Transactional Memory with
                 Snapshot Isolation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "2",
  pages =        "282--294",
  month =        may,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3140659.3080204",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Sep 15 11:09:14 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Snapshot Isolation (SI) is an established model in the
                 database community, which permits write-read conflicts
                 to pass and aborts transactions only on write-write
                 conflicts. With the Write Skew anomaly correctly
                 eliminated, SI can reduce the occurrence of aborts,
                 save the work done by transactions, and greatly benefit
                 long transactions involving complex data structures.
                 GPUs are evolving towards a general-purpose computing
                 device with growing support for irregular workloads,
                 including transactional memory. The usage of snapshot
                 isolation on transactional memory has proven to be
                 greatly beneficial for performance. In this paper, we
                 propose a multi-versioned memory subsystem for
                 hardware-based transactional memory on the GPU, with a
                 method for eliminating the Write Skew anomaly on the
                 fly, and finally incorporate Snapshot Isolation with
                 this system. The results show that snapshot isolation
                 can effectively boost the performance of dynamically
                 sized data structures such as linked lists, binary
                 trees and red-black trees, sometimes by as much as
                 4.5x, which results in improved overall performance of
                 benchmarks utilizing these data structures.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Wang:2017:DAC,
  author =       "Kai Wang and Calvin Lin",
  title =        "Decoupled Affine Computation for {SIMT GPUs}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "2",
  pages =        "295--306",
  month =        may,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3140659.3080205",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Sep 15 11:09:14 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper introduces a method of decoupling affine
                 computations---a class of expressions that produces
                 extremely regular values across SIMT threads---from the
                 main execution stream, so that the affine computations
                 can be performed with greater efficiency and with
                 greater independence from the main execution stream.
                 This decoupling has two benefits: (1) For compute-bound
                 programs, it significantly reduces the dynamic warp
                 instruction count; (2) for memory-bound workloads, it
                 significantly reduces memory latency, since it acts as
                 a non-speculative prefetcher for the data specified by
                 the many memory address calculations that are affine
                 computations. We evaluate our solution, known as
                 Decoupled Affine Computation (DAC), using GPGPU-sim and
                 a set of 29 GPGPU programs. We find that on average,
                 DAC improves performance by 40\% and reduces energy
                 consumption by 20\%. For the 11 compute-bound
                 benchmarks, DAC improves performance by 34\%, compared
                 with 11\% for the previous state-of-the-art. For the 18
                 memory-bound programs, DAC improves performance by an
                 average of 44\%, compared with 16\% for
                 state-of-the-art GPU prefetcher.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Koo:2017:APA,
  author =       "Gunjae Koo and Yunho Oh and Won Woo Ro and Murali
                 Annavaram",
  title =        "Access Pattern-Aware Cache Management for Improving
                 Data Utilization in {GPU}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "2",
  pages =        "307--319",
  month =        may,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3140659.3080239",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Sep 15 11:09:14 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Long latency of memory operation is a prominent
                 performance bottleneck in graphics processing units
                 (GPUs). The small data cache that must be shared across
                 dozens of warps (a collection of threads) creates
                 significant cache contention and premature data
                 eviction. Prior works have recognized this problem and
                 proposed warp throttling which reduces the number of
                 active warps contending for cache space. In this paper
                 we discover that individual load instructions in a warp
                 exhibit four different types of data locality behavior:
                 (1) data brought by a warp load instruction is used
                 only once, which is classified as streaming data; (2)
                 data brought by a warp load is reused multiple times
                 within the same warp, called intra-warp locality; (3)
                 data brought by a warp is reused multiple times but
                 across different warps, called inter-warp locality;
                 and (4) some data exhibit a mix of both intra- and
                 inter-warp locality. Furthermore, each load instruction
                 exhibits consistently the same locality type across all
                 warps within a GPU kernel. Based on this discovery we
                 argue that cache management must be done using per-load
                 locality type information, rather than applying
                 warp-wide cache management policies. We propose Access
                 Pattern-aware Cache Management (APCM), which
                 dynamically detects the locality type of each load
                 instruction by monitoring the accesses from one
                 exemplary warp. APCM then uses the detected locality
                 type to selectively apply cache bypassing and cache
                 pinning of data based on load locality
                 characterization. Using an extensive set of simulations
                 we show that APCM improves performance of GPUs by 34\%
                 for cache sensitive applications while saving 27\% of
                 energy consumption over baseline GPU.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Arunkumar:2017:MGM,
  author =       "Akhil Arunkumar and Evgeny Bolotin and Benjamin Cho
                 and Ugljesa Milic and Eiman Ebrahimi and Oreste Villa
                 and Aamer Jaleel and Carole-Jean Wu and David Nellans",
  title =        "{MCM-GPU}: Multi-Chip-Module {GPUs} for Continued
                 Performance Scalability",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "2",
  pages =        "320--332",
  month =        may,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3140659.3080231",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Sep 15 11:09:14 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Historically, improvements in GPU-based high
                 performance computing have been tightly coupled to
                 transistor scaling. As Moore's law slows down, and the
                 number of transistors per die no longer grows at
                 historical rates, the performance curve of single
                 monolithic GPUs will ultimately plateau. However, the
                 need for higher performing GPUs continues to exist in
                 many domains. To address this need, in this paper we
                 demonstrate that package-level integration of multiple
                 GPU modules to build larger logical GPUs can enable
                 continuous performance scaling beyond Moore's law.
                 Specifically, we propose partitioning GPUs into easily
                 manufacturable basic GPU Modules (GPMs), and
                 integrating them on package using high bandwidth and
                 power efficient signaling technologies. We lay out the
                 details and evaluate the feasibility of a basic
                 Multi-Chip-Module GPU (MCM-GPU) design. We then propose
                 three architectural optimizations that significantly
                 improve GPM data locality and minimize the sensitivity
                 on inter-GPM bandwidth. Our evaluation shows that the
                 optimized MCM-GPU achieves 22.8\% speedup and 5x
                 inter-GPM bandwidth reduction when compared to the
                 basic MCM-GPU architecture. Most importantly, the
                 optimized MCM-GPU design is 45.5\% faster than the
                 largest implementable monolithic GPU, and performs
                 within 10\% of a hypothetical (and unbuildable)
                 monolithic GPU. Lastly we show that our optimized
                 MCM-GPU is 26.8\% faster than an equally equipped
                 Multi-GPU system with the same total number of SMs and
                 DRAM bandwidth.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Nazari:2017:EEB,
  author =       "Alireza Nazari and Nader Sehatbakhsh and Monjur Alam
                 and Alenka Zajic and Milos Prvulovic",
  title =        "{EDDIE}: {EM}-Based Detection of Deviations in Program
                 Execution",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "2",
  pages =        "333--346",
  month =        may,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3140659.3080223",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Sep 15 11:09:14 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "This paper describes EM-Based Detection of Deviations
                 in Program Execution (EDDIE), a new method for
                 detecting anomalies in program execution, such as
                 malware and other code injections, without introducing
                 any overheads, adding any hardware support, changing
                 any software, or using any resources on the monitored
                 system itself. Monitoring with EDDIE involves receiving
                 electromagnetic (EM) emanations that are emitted as a
                 side effect of execution on the monitored system, and
                 it relies on spikes in the EM spectrum that are
                 produced as a result of periodic (e.g. loop) activity
                 in the monitored execution. During training, EDDIE
                 characterizes normal execution behavior in terms of
                 peaks in the EM spectrum that are observed at various
                 points in the program execution, but it does not need
                 any characterization of the malware or other code that
                 might later be injected. During monitoring, EDDIE
                 identifies peaks in the observed EM spectrum, and
                 compares these peaks to those learned during training.
                 Since EDDIE requires no resources on the monitored
                 machine and no changes to the monitored software, it is
                 especially well suited for security monitoring of
                 embedded and IoT devices. We evaluate EDDIE on a real
                 IoT system and in a cycle-accurate simulator, and find
                 that even relatively brief injected bursts of activity
                 (a few milliseconds) are detected by EDDIE with high
                 accuracy, and that it also accurately detects when even
                 a few instructions are injected into an existing loop
                 within the application.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Yan:2017:SHA,
  author =       "Mengjia Yan and Bhargava Gopireddy and Thomas Shull
                 and Josep Torrellas",
  title =        "Secure Hierarchy-Aware Cache Replacement Policy
                 {(SHARP)}: Defending Against Cache-Based Side Channel
                 Attacks",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "2",
  pages =        "347--360",
  month =        may,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3140659.3080222",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Sep 15 11:09:14 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "In cache-based side channel attacks, a spy that shares
                 a cache with a victim probes cache locations to extract
                 information on the victim's access patterns. For
                 example, in evict+reload, the spy repeatedly evicts and
                 then reloads a probe address, checking if the victim
                 has accessed the address in between the two operations.
                 While there are many proposals to combat these cache
                 attacks, they all have limitations: they either hurt
                 performance, require programmer intervention, or can
                 only defend against some types of attacks. This paper
                 makes the following observation for an environment with
                 an inclusive cache hierarchy: when the spy evicts the
                 probe address from the shared cache, the address will
                 also be evicted from the private cache of the victim
                 process, creating an inclusion victim. Consequently, to
                 disable cache attacks, this paper proposes to alter the
                 line replacement algorithm of the shared cache, to
                 prevent a process from creating inclusion victims in
                 the caches of cores running other processes. By
                 enforcing this rule, the spy cannot evict the probe
                 address from the shared cache and, hence, cannot
                 glimpse any information on the victim's access
                 patterns. We call our proposal SHARP (Secure
                 Hierarchy-Aware cache Replacement Policy). SHARP
                 efficiently defends against all existing cross-core
                 shared-cache attacks, needs only minimal hardware
                 modifications, and requires no code modifications. We
                 implement SHARP in a cycle-level full-system simulator.
                 We show that it protects against real-world attacks,
                 and that it introduces negligible average performance
                 degradation.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Deng:2017:LLH,
  author =       "Zhaoxia Deng and Ariel Feldman and Stuart A. Kurtz and
                 Frederic T. Chong",
  title =        "Lemonade from Lemons: Harnessing Device Wearout to
                 Create Limited-Use Security Architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "2",
  pages =        "361--374",
  month =        may,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3140659.3080226",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Sep 15 11:09:14 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Most architectures are designed to mitigate the
                 usually undesirable phenomenon of device wearout. We
                 take a contrarian view and harness this phenomenon to
                 create hardware security mechanisms that resist attacks
                 by statistically enforcing an upper bound on hardware
                 uses, and consequently attacks. For example, let us
                 assume that a user may log into a smartphone a maximum
                 of 50 times a day for 5 years, resulting in
                 approximately 91,250 legitimate uses. If we assume at
                 least 8-character passwords and we require login (and
                 retrieval of the storage decryption key) to traverse
                 hardware that wears out in 91,250 uses, then an
                 adversary has a negligible chance of successful
                 brute-force attack before the hardware wears out, even
                 assuming real-world password cracking by professionals.
                 M-way replication of our hardware and periodic
                 re-encryption of storage can increase the daily usage
                 bound by a factor of M. The key challenge is to achieve
                 practical statistical bounds on both minimum and
                 maximum uses for an architecture, given that individual
                 devices can vary widely in wearout characteristics. We
                 introduce techniques for architecturally controlling
                 these bounds and perform a design space exploration for
                 three use cases: a limited-use connection, a
                 limited-use targeting system and one-time pads. These
                 techniques include decision trees, parallel structures,
                 Shamir's secret-sharing mechanism, Reed--Solomon codes,
                 and module replication. We explore the cost in area,
                 energy and latency of using these techniques to achieve
                 system-level usage targets given device-level wearout
                 distributions. With redundant encoding, for example, we
                 can improve exponential sensitivity to device lifetime
                 variation to linear sensitivity, reducing the total
                 number of NEMS devices by 4 orders of magnitude to
                 about 0.8 million for limited-use connections (compared
                 with 4 billion if without redundant encoding).",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Altaf:2017:LHL,
  author =       "Muhammad Shoaib Bin Altaf and David A. Wood",
  title =        "{LogCA}: a High-Level Performance Model for Hardware
                 Accelerators",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "2",
  pages =        "375--388",
  month =        may,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3140659.3080216",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Sep 15 11:09:14 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "With the end of Dennard scaling, architects have
                 increasingly turned to special-purpose hardware
                 accelerators to improve the performance and energy
                 efficiency for some applications. Unfortunately,
                 accelerators don't always live up to their expectations
                 and may under-perform in some situations. Understanding
                 the factors which effect the performance of an
                 accelerator is crucial for both architects and
                 programmers early in the design stage. Detailed models
                 can be highly accurate, but often require low-level
                 details which are not available until late in the
                 design cycle. In contrast, simple analytical models can
                 provide useful insights by abstracting away low-level
                 system details. In this paper, we propose LogCA---a
                 high-level performance model for hardware accelerators.
                 LogCA helps both programmers and architects identify
                 performance bounds and design bottlenecks early in the
                 design cycle, and provide insight into which
                 optimizations may alleviate these bottlenecks. We
                 validate our model across a variety of kernels, ranging
                 from sub-linear to super-linear complexities on both
                 on-chip and off-chip accelerators. We also describe the
                 utility of LogCA using two retrospective case studies.
                 First, we discuss the evolution of interface design in
                 SUN/Oracle's encryption accelerators. Second, we
                 discuss the evolution of memory interface design in
                 three different GPU architectures. In both cases, we
                 show that the adopted design optimizations for these
                 machines are similar to LogCA's suggested
                 optimizations. We argue that architects and programmers
                 can use insights from these retrospective studies for
                 improving future designs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Prabhakar:2017:PRA,
  author =       "Raghu Prabhakar and Yaqi Zhang and David Koeplinger
                 and Matt Feldman and Tian Zhao and Stefan Hadjis and
                 Ardavan Pedram and Christos Kozyrakis and Kunle
                 Olukotun",
  title =        "{Plasticine}: a Reconfigurable Architecture For
                 Parallel Patterns",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "2",
  pages =        "389--402",
  month =        may,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3140659.3080256",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Sep 15 11:09:14 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Reconfigurable architectures have gained popularity in
                 recent years as they allow the design of
                 energy-efficient accelerators. Fine-grain fabrics (e.g.
                 FPGAs) have traditionally suffered from performance and
                 power inefficiencies due to bit-level reconfigurable
                 abstractions. Both fine-grain and coarse-grain
                 architectures (e.g. CGRAs) traditionally require low
                 level programming and suffer from long compilation
                 times. We address both challenges with Plasticine, a
                 new spatially reconfigurable architecture designed to
                 efficiently execute applications composed of parallel
                 patterns. Parallel patterns have emerged from recent
                 research on parallel programming as powerful,
                 high-level abstractions that can elegantly capture data
                 locality, memory access patterns, and parallelism
                 across a wide range of dense and sparse applications.
                 We motivate Plasticine by first observing key
                 application characteristics captured by parallel
                 patterns that are amenable to hardware acceleration,
                 such as hierarchical parallelism, data locality, memory
                 access patterns, and control flow. Based on these
                 observations, we architect Plasticine as a collection
                 of Pattern Compute Units and Pattern Memory Units.
                 Pattern Compute Units are multi-stage pipelines of
                 reconfigurable SIMD functional units that can
                 efficiently execute nested patterns. Data locality is
                 exploited in Pattern Memory Units using banked
                 scratchpad memories and configurable address decoders.
                 Multiple on-chip address generators and scatter-gather
                 engines make efficient use of DRAM bandwidth by
                 supporting a large number of outstanding memory
                 requests, memory coalescing, and burst mode for dense
                 accesses. Plasticine has an area footprint of 113 mm$^2$
                 in a 28nm process, and consumes a maximum power of 49 W
                 at a 1 GHz clock. Using a cycle-accurate simulator, we
                 demonstrate that Plasticine provides an improvement of
                 up to 76.9x in performance-per-Watt over a conventional
                 FPGA over a wide range of dense and sparse
                 applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kung:2017:PHA,
  author =       "Jaeha Kung and Yun Long and Duckhwan Kim and Saibal
                 Mukhopadhyay",
  title =        "A Programmable Hardware Accelerator for Simulating
                 Dynamical Systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "2",
  pages =        "403--415",
  month =        may,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3140659.3080252",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Sep 15 11:09:14 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The fast and energy-efficient simulation of dynamical
                 systems defined by coupled ordinary/partial
                 differential equations has emerged as an important
                 problem. The accelerated simulation of coupled ODE/PDE
                 is critical for analysis of physical systems as well as
                 computing with dynamical systems. This paper presents a
                 fast and programmable accelerator for simulating
                 dynamical systems. The computing model of the proposed
                 platform is based on multilayer cellular nonlinear
                 network (CeNN) augmented with nonlinear function
                 evaluation engines. The platform can be programmed to
                 accelerate wide classes of ODEs/PDEs by modulating the
                 connectivity within the multilayer CeNN engine. An
                 innovative hardware architecture including data reuse,
                 memory hierarchy, and near-memory processing is
                 designed to accelerate the augmented multilayer CeNN. A
                 dataflow model is presented which is supported by
                 optimized memory hierarchy for efficient function
                 evaluation. The proposed solver is designed and
                 synthesized in 15nm technology for the hardware
                 analysis. The performance is evaluated and compared to
                 GPU nodes when solving wide classes of differential
                 equations and the power consumption is analyzed to show
                 orders of magnitude improvement in energy efficiency.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Nowatzki:2017:SDA,
  author =       "Tony Nowatzki and Vinay Gangadhar and Newsha Ardalani
                 and Karthikeyan Sankaralingam",
  title =        "Stream-Dataflow Acceleration",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "2",
  pages =        "416--429",
  month =        may,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3140659.3080255",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Sep 15 11:09:14 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Demand for low-power data processing hardware
                 continues to rise inexorably. Existing programmable and
                 ``general purpose'' solutions (eg. SIMD, GPGPUs) are
                 insufficient, as evidenced by the order-of-magnitude
                 improvements and industry adoption of application and
                 domain-specific accelerators in important areas like
                 machine learning, computer vision and big data. The
                 stark tradeoffs between efficiency and generality at
                 these two extremes poses a difficult question: how
                 could domain-specific hardware efficiency be achieved
                 without domain-specific hardware solutions? In this
                 work, we rely on the insight that ``acceleratable''
                 algorithms have broad common properties: high
                 computational intensity with long phases, simple
                 control patterns and dependences, and simple streaming
                 memory access and reuse patterns. We define a general
                 architecture (a hardware-software interface) which can
                 more efficiently expresses program with these
                 properties called stream-dataflow. The dataflow
                 component of this architecture enables high
                 concurrency, and the stream component enables
                 communication and coordination at very-low power and
                 area overhead. This paper explores the hardware and
                 software implications, describes its detailed
                 microarchitecture, and evaluates an implementation.
                 Compared to a state-of-the-art domain specific
                 accelerator (DianNao), and fixed-function accelerators
                 for MachSuite, Softbrain can match their performance
                 with only 2x power overhead on average.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Yan:2017:HTC,
  author =       "Zi Yan and J{\'a}n Vesel{\'y} and Guilherme Cox and
                 Abhishek Bhattacharjee",
  title =        "Hardware Translation Coherence for Virtualized
                 Systems",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "2",
  pages =        "430--443",
  month =        may,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3140659.3080211",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Sep 15 11:09:14 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "To improve system performance, operating systems
                 (OSes) often undertake activities that require
                 modification of virtual-to-physical address
                 translations. For example, the OS may migrate data
                 between physical pages to manage heterogeneous memory
                 devices. We refer to such activities as page
                 remappings. Unfortunately, page remappings are
                 expensive. We show that a big part of this cost arises
                 from address translation coherence, particularly on
                 systems employing virtualization. In response, we
                 propose hardware translation invalidation and coherence
                 or HATRIC, a readily implementable hardware mechanism
                 to piggyback translation coherence atop existing cache
                 coherence protocols. We perform detailed studies using
                 KVM-based virtualization, showing that HATRIC achieves
                 up to 30\% performance and 10\% energy benefits, for
                 per-CPU area overheads of 0.2\%. We also quantify
                 HATRIC's benefits on systems running Xen and find up to
                 33\% performance improvements.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Park:2017:HTC,
  author =       "Chang Hyun Park and Taekyung Heo and Jungi Jeong and
                 Jaehyuk Huh",
  title =        "Hybrid {TLB} Coalescing: Improving {TLB} Translation
                 Coverage under Diverse Fragmented Memory Allocations",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "2",
  pages =        "444--456",
  month =        may,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3140659.3080217",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Sep 15 11:09:14 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "To mitigate excessive TLB misses in large memory
                 applications, techniques such as large pages, variable
                 length segments, and HW coalescing, increase the
                 coverage of limited hardware translation entries by
                 exploiting the contiguous memory allocation. However,
                 recent studies show that in non-uniform memory systems,
                 using large pages often leads to performance
                 degradation, or allocating large chunks of memory
                 becomes more difficult due to memory fragmentation.
                 Although each of the prior techniques favors its own
                 best chunk size, diverse contiguity of memory
                 allocation in real systems cannot always provide the
                 optimal chunk of each technique. Under such fragmented
                 and diverse memory allocations, this paper proposes a
                 novel HW-SW hybrid translation architecture, which can
                 adapt to different memory mappings efficiently. In the
                 proposed hybrid coalescing technique, the operating
                 system encodes memory contiguity information in a
                 subset of page table entries, called anchor entries.
                 During address translation through TLBs, an anchor
                 entry provides translation for contiguous pages
                 following the anchor entry. As a smaller number of
                 anchor entries can cover a large portion of virtual
                 address space, the efficiency of TLB can be
                 significantly improved. The most important benefit of
                 hybrid coalescing is its ability to change the coverage
                 of the anchor entry dynamically, reflecting the current
                 allocation contiguity status. By using the contiguity
                 information directly set by the operating system, the
                 technique can provide scalable translation coverage
                 improvements with minor hardware changes, while
                 allowing the flexibility of memory allocation. Our
                 experimental results show that across diverse
                 allocation scenarios with different distributions of
                 contiguous memory chunks, the proposed scheme can
                 effectively reap the potential translation coverage
                 improvement from the existing contiguity.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Alam:2017:DIY,
  author =       "Hanna Alam and Tianhao Zhang and Mattan Erez and Yoav
                 Etsion",
  title =        "Do-It-Yourself Virtual Memory Translation",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "2",
  pages =        "457--468",
  month =        may,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3140659.3080209",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Sep 15 11:09:14 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "In this paper, we introduce the Do-It-Yourself virtual
                 memory translation (DVMT) architecture as a flexible
                 complement for current hardware-fixed translation
                 flows. DVMT decouples the virtual-to-physical mapping
                 process from the access permissions, giving
                 applications freedom in choosing mapping schemes, while
                 maintaining security within the operating system.
                 Furthermore, DVMT is designed to support virtualized
                 environments, as a means to collapse the costly,
                 hardware-assisted two-dimensional translations. We
                 describe the architecture in detail and demonstrate its
                 effectiveness by evaluating several different DVMT
                 schemes on a range of virtualized applications with a
                 model based on measurements from a commercial system.
                 We show that different DVMT configurations preserve the
                 native performance, while achieving speedups of 1.2x to
                 2.0x in virtualized environments.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Ryoo:2017:RTD,
  author =       "Jee Ho Ryoo and Nagendra Gulur and Shuang Song and
                 Lizy K. John",
  title =        "Rethinking {TLB} Designs in Virtualized Environments:
                 a Very Large Part-of-Memory {TLB}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "2",
  pages =        "469--480",
  month =        may,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3140659.3080210",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Sep 15 11:09:14 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "With increasing deployment of virtual machines for
                 cloud services and server applications, memory address
                 translation overheads in virtualized environments have
                 received great attention. In the radix-4 type of page
                 tables used in x86 architectures, a TLB-miss
                 necessitates up to 24 memory references for one guest
                 to host translation. While dedicated page walk caches
                 and such recent enhancements eliminate many of these
                 memory references, our measurements on the Intel
                 Skylake processors indicate that many programs in
                 virtualized mode of execution still spend hundreds of
                 cycles for translations that do not hit in the TLBs.
                 This paper presents an innovative scheme to reduce the
                 cost of address translations by using a very large
                 Translation Lookaside Buffer that is part of memory,
                 the POM-TLB. In the POM-TLB, only one access is
                 required instead of up to 24 accesses required in
                 commonly used 2D walks with radix-4 type of page
                 tables. Even if many of the 24 accesses may hit in the
                 page walk caches, the aggregated cost of the many hits
                 plus the overhead of occasional misses from page walk
                 caches still exceeds the cost of one access to the
                 POM-TLB. Since the POM-TLB is part of the memory space,
                 TLB entries (as opposed to multiple page table entries)
                 can be cached in large L2 and L3 data caches, yielding
                 significant benefits. Through detailed evaluation
                 running SPEC, PARSEC and graph workloads, we
                 demonstrate that the proposed POM-TLB improves
                 performance by approximately 10\% on average. The
                 improvement is more than 16\% for 5 of the benchmarks.
                 It is further seen that a POM-TLB of 16MB size can
                 eliminate nearly all TLB misses in 8-core systems.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Kolli:2017:LLP,
  author =       "Aasheesh Kolli and Vaibhav Gogte and Ali Saidi and
                 Stephan Diestelhorst and Peter M. Chen and Satish
                 Narayanasamy and Thomas F. Wenisch",
  title =        "Language-Level Persistency",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "2",
  pages =        "481--493",
  month =        may,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3140659.3080229",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Sep 15 11:09:14 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "The commercial release of byte-addressable persistent
                 memories, such as Intel/Micron 3D XPoint memory, is
                 imminent. Ongoing research has sought mechanisms to
                 allow programmers to implement recoverable data
                 structures in these new main memories. Ensuring
                 recoverability requires programmer control of the order
                 of persistent stores; recent work proposes persistency
                 models as an extension to memory consistency to specify
                 such ordering. Prior work has considered persistency
                 models at the abstraction of the instruction set
                 architecture. Instead, we argue for extending the
                 language-level memory model to provide guarantees on
                 the order of persistent writes. We explore a taxonomy
                 of guarantees a language-level persistency model might
                 provide, considering both atomicity and ordering
                 constraints on groups of persistent stores. Then, we
                 propose and evaluate Acquire-Release Persistency (ARP),
                 a language-level persistency model for C++11. We
                 describe how to compile code written for ARP to a
                 state-of-the-art ISA-level persistency model. We then
                 consider enhancements to the ISA-level persistency
                 model that can distinguish memory consistency
                 constraints required for proper synchronization but
                 unnecessary for correct recovery. With these
                 optimizations, we show that ARP increases performance
                 by up to 33.2\% (19.8\% avg.) over coding directly to
                 the baseline ISA-level persistency model for a suite of
                 persistent-write-intensive workloads.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Choi:2017:SAS,
  author          = {Jiho Choi and Thomas Shull and Maria J. Garzaran and
                    Josep Torrellas},
  title           = {{ShortCut}: Architectural Support for Fast Object
                    Access in Scripting Languages},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {45},
  number          = {2},
  pages           = {494--506},
  month           = may,
  year            = {2017},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/3140659.3080237},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Sep 15 11:09:14 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {The same flexibility that makes dynamic scripting
                    languages appealing to programmers is also the primary
                    cause of their low performance. To access objects of
                    potentially different types, the compiler creates a
                    dispatcher with a series of if statements, each
                    performing a comparison to a type and a jump to a
                    handler. This induces major overhead in instructions
                    executed and branches mispredicted. This paper proposes
                    architectural support to significantly improve the
                    efficiency of accesses to objects. The idea is to
                    modify the instruction that calls the dispatcher so
                    that, under most conditions, it skips most of the
                    branches and instructions needed to reach the correct
                    handler, and sometimes even the execution of the
                    handler itself. Our novel architecture, called
                    ShortCut, performs two levels of optimization. Its
                    Plain design transforms the call to the dispatcher into
                    a call to the correct handler --- bypassing the whole
                    dispatcher execution. Its Aggressive design transforms
                    the call to the dispatcher into a simple load or store
                    --- bypassing the execution of both dispatcher and
                    handler. We implement the ShortCut software in the
                    state-of-the-art Google V8 JIT compiler, and the
                    ShortCut hardware in a simulator. We evaluate ShortCut
                    with the Octane and SunSpider JavaScript application
                    suites. Plain ShortCut reduces the average execution
                    time of the applications by 30\% running under the
                    baseline compiler, and by 11\% running under the
                    maximum level of compiler optimization. Aggressive
                    ShortCut performs only slightly better.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Gope:2017:ASS,
  author          = {Dibakar Gope and David J. Schlais and Mikko H.
                    Lipasti},
  title           = {Architectural Support for Server-Side {PHP}
                    Processing},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {45},
  number          = {2},
  pages           = {507--520},
  month           = may,
  year            = {2017},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/3140659.3080234},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Sep 15 11:09:14 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {PHP is the dominant server-side scripting language
                    used to implement dynamic web content. Just-in-time
                    compilation, as implemented in Facebook's
                    state-of-the-art HipHopVM, helps mitigate the poor
                    performance of PHP, but substantial overheads remain,
                    especially for realistic, large-scale PHP applications.
                    This paper analyzes such applications and shows that
                    there is little opportunity for conventional
                    microarchitectural enhancements. Furthermore, prior
                    approaches for function-level hardware acceleration
                    present many challenges due to the extremely flat
                    distribution of execution time across a large number of
                    functions in these complex applications. In-depth
                    analysis reveals a more promising alternative: targeted
                    acceleration of four fine-grained PHP activities: hash
                    table accesses, heap management, string manipulation,
                    and regular expression handling. We highlight a set of
                    guiding principles and then propose and evaluate
                    inexpensive hardware accelerators for these activities
                    that accrue substantial performance and energy gains
                    across dozens of functions. Our results reflect an
                    average 17.93\% improvement in performance and 21.01\%
                    reduction in energy while executing these complex PHP
                    workloads on a state-of-the-art software and hardware
                    platform.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Kannan:2017:HDH,
  author =       "Sudarsun Kannan and Ada Gavrilovska and Vishal Gupta
                 and Karsten Schwan",
  title =        "{HeteroOS}: {OS} Design for Heterogeneous Memory
                 Management in Datacenter",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "2",
  pages =        "521--534",
  month =        may,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3140659.3080245",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Sep 15 11:09:14 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "Heterogeneous memory management combined with server
                 virtualization in datacenters is expected to increase
                 the software and OS management complexity.
                 State-of-the-art solutions rely exclusively on the
                 hypervisor (VMM) for expensive page hotness tracking
                 and migrations, limiting the benefits from
                 heterogeneity. To address this, we design HeteroOS, a
                 novel application-transparent OS-level solution for
                 managing memory heterogeneity in virtualized system.
                 The HeteroOS design first makes the guest-OSes
                 heterogeneity-aware and then extracts rich OS-level
                 information about applications' memory usage to place
                 data in the `right' memory avoiding page migrations.
                 When such pro-active placements are not possible,
                 HeteroOS combines the power of the guest-OSes'
                 information about applications with the VMM's hardware
                 control to track for hotness and migrate only
                 performance-critical pages. Finally, HeteroOS also
                 designs an efficient heterogeneous memory sharing
                 across multiple guest-VMs. Evaluation of HeteroOS with
                 memory, storage, and network-intensive datacenter
                 applications shows up to 2x performance improvement
                 compared to the state-of-the-art VMM-exclusive
                 approach.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Shen:2017:MCA,
  author          = {Yongming Shen and Michael Ferdman and Peter Milder},
  title           = {Maximizing {CNN} Accelerator Efficiency Through
                    Resource Partitioning},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {45},
  number          = {2},
  pages           = {535--547},
  month           = may,
  year            = {2017},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/3140659.3080221},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Sep 15 11:09:14 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Convolutional neural networks (CNNs) are
                    revolutionizing machine learning, but they present
                    significant computational challenges. Recently, many
                    FPGA-based accelerators have been proposed to improve
                    the performance and efficiency of CNNs. Current
                    approaches construct a single processor that computes
                    the CNN layers one at a time; the processor is
                    optimized to maximize the throughput at which the
                    collection of layers is computed. However, this
                    approach leads to inefficient designs because the same
                    processor structure is used to compute CNN layers of
                    radically varying dimensions. We present a new CNN
                    accelerator paradigm and an accompanying automated
                    design methodology that partitions the available FPGA
                    resources into multiple processors, each of which is
                    tailored for a different subset of the CNN
                    convolutional layers. Using the same FPGA resources as
                    a single large processor, multiple smaller specialized
                    processors increase computational efficiency and lead
                    to a higher overall throughput. Our design methodology
                    achieves 3.8x higher throughput than the
                    state-of-the-art approach on evaluating the popular
                    AlexNet CNN on a Xilinx Virtex-7 FPGA. For the more
                    recent SqueezeNet and GoogLeNet, the speedups are 2.2x
                    and 2.0x.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Yu:2017:SCD,
  author          = {Jiecao Yu and Andrew Lukefahr and David Palframan and
                    Ganesh Dasika and Reetuparna Das and Scott Mahlke},
  title           = {{Scalpel}: Customizing {DNN} Pruning to the Underlying
                    Hardware Parallelism},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {45},
  number          = {2},
  pages           = {548--560},
  month           = may,
  year            = {2017},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/3140659.3080215},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Sep 15 11:09:14 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {As the size of Deep Neural Networks (DNNs) continues
                    to grow to increase accuracy and solve more complex
                    problems, their energy footprint also scales. Weight
                    pruning reduces DNN model size and the computation by
                    removing redundant weights. However, we implemented
                    weight pruning for several popular networks on a
                    variety of hardware platforms and observed surprising
                    results. For many networks, the network sparsity caused
                    by weight pruning will actually hurt the overall
                    performance despite large reductions in the model size
                    and required multiply-accumulate operations. Also,
                    encoding the sparse format of pruned networks incurs
                    additional storage space overhead. To overcome these
                    challenges, we propose Scalpel that customizes DNN
                    pruning to the underlying hardware by matching the
                    pruned network structure to the data-parallel hardware
                    organization. Scalpel consists of two techniques:
                    SIMD-aware weight pruning and node pruning. For
                    low-parallelism hardware (e.g., microcontroller),
                    SIMD-aware weight pruning maintains weights in aligned
                    fixed-size groups to fully utilize the SIMD units. For
                    high-parallelism hardware (e.g., GPU), node pruning
                    removes redundant nodes, not redundant weights, thereby
                    reducing computation without sacrificing the dense
                    matrix format. For hardware with moderate parallelism
                    (e.g., desktop CPU), SIMD-aware weight pruning and node
                    pruning are synergistically applied together. Across
                    the microcontroller, CPU and GPU, Scalpel achieves mean
                    speedups of 3.54x, 2.61x, and 1.25x while reducing the
                    model sizes by 88\%, 82\%, and 53\%. In comparison,
                    traditional weight pruning achieves mean speedups of
                    1.90x, 1.06x, 0.41x across the three platforms.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Sa:2017:UOA,
  author          = {Christopher {De Sa} and Matthew Feldman and
                    Christopher R{\'e} and Kunle Olukotun},
  title           = {Understanding and Optimizing Asynchronous
                    Low-Precision Stochastic Gradient Descent},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {45},
  number          = {2},
  pages           = {561--574},
  month           = may,
  year            = {2017},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/3140659.3080248},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Sep 15 11:09:14 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Stochastic gradient descent (SGD) is one of the most
                    popular numerical algorithms used in machine learning
                    and other domains. Since this is likely to continue for
                    the foreseeable future, it is important to study
                    techniques that can make it run fast on parallel
                    hardware. In this paper, we provide the first analysis
                    of a technique called Buck-wild! that uses both
                    asynchronous execution and low-precision computation.
                    We introduce the DMGC model, the first
                    conceptualization of the parameter space that exists
                    when implementing low-precision SGD, and show that it
                    provides a way to both classify these algorithms and
                    model their performance. We leverage this insight to
                    propose and analyze techniques to improve the speed of
                    low-precision SGD. First, we propose software
                    optimizations that can increase throughput on existing
                    CPUs by up to 11X. Second, we propose architectural
                    changes, including a new cache technique we call an
                    obstinate cache, that increase throughput beyond the
                    limits of current-generation hardware. We also
                    implement and analyze low-precision SGD on the FPGA,
                    which is a promising alternative to the CPU for future
                    SGD systems.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Li:2017:API,
  author =       "Zhaoshi Li and Leibo Liu and Yangdong Deng and Shouyi
                 Yin and Yao Wang and Shaojun Wei",
  title =        "Aggressive Pipelining of Irregular Applications on
                 Reconfigurable Hardware",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "2",
  pages =        "575--586",
  month =        may,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3140659.3080228",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Sep 15 11:09:14 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "CPU-FPGA heterogeneous platforms offer a promising
                 solution for high-performance and energy-efficient
                 computing systems by providing specialized accelerators
                 with post-silicon reconfigurability. To unleash the
                 power of FPGA, however, the programmability gap has to
                 be filled so that applications specified in high-level
                 programming languages can be efficiently mapped and
                 scheduled on FPGA. The above problem is even more
                 challenging for irregular applications, in which the
                 execution dependency can only be determined at run
                 time. Thus over-serialized accelerators are generated
                 from existing works that rely on compile time analysis
                 to schedule the computation. In this work, we propose a
                 comprehensive software-hardware co-design framework,
                 which captures parallelism in irregular applications
                 and aggressively schedules pipelined execution on
                 reconfigurable platform. Based on an inherently
                 parallel abstraction packaging parallelism for runtime
                 schedule, our framework significantly differs from
                 existing works that tend to schedule executions at
                 compile time. An irregular application is formulated as
                 a set of tasks with their dependencies specified as
                 rules describing the conditions under which a subset of
                 tasks can be executed concurrently. Then datapaths on
                 FPGA will be generated by transforming applications in
                 the formulation into task pipelines orchestrated by
                 evaluating rules at runtime, which could exploit
                 fine-grained pipeline parallelism as handcrafted
                 accelerators do. An evaluation shows that this
                 framework is able to produce datapath with its quality
                 close to handcrafted designs. Experiments show that
                 generated accelerators are dramatically more efficient
                 than those created by current high-level synthesis
                 tools. Meanwhile, accelerators generated for a set of
                 irregular applications attain 0.5x--1.9x performance
                 compared to equivalent software implementations we
                 selected on a server-grade 10-core processor, with the
                 memory subsystem remaining as the bottleneck.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Subramanian:2017:FEM,
  author          = {Suvinay Subramanian and Mark C. Jeffrey and Maleen
                    Abeydeera and Hyun Ryong Lee and Victor A. Ying and
                    Joel Emer and Daniel Sanchez},
  title           = {Fractal: an Execution Model for Fine-Grain Nested
                    Speculative Parallelism},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {45},
  number          = {2},
  pages           = {587--599},
  month           = may,
  year            = {2017},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/3140659.3080218},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Sep 15 11:09:14 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Most systems that support speculative parallelization,
                    like hardware transactional memory (HTM), do not
                    support nested parallelism. This sacrifices substantial
                    parallelism and precludes composing parallel
                    algorithms. And the few HTMs that do support nested
                    parallelism focus on parallelizing at the coarsest
                    (shallowest) levels, incurring large overheads that
                    squander most of their potential. We present FRACTAL, a
                    new execution model that supports unordered and
                    timestamp-ordered nested parallelism. FRACTAL lets
                    programmers seamlessly compose speculative parallel
                    algorithms, and lets the architecture exploit
                    parallelism at all levels. FRACTAL can parallelize a
                    broader range of applications than prior speculative
                    execution models. We design a FRACTAL implementation
                    that extends the Swarm architecture and focuses on
                    parallelizing at the finest (deepest) levels. Our
                    approach sidesteps the issues of nested parallel HTMs
                    and uncovers abundant fine-grain parallelism. As a
                    result, FRACTAL outperforms prior speculative
                    architectures by up to 88x at 256 cores.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Subramaniyan:2017:PAP,
  author          = {Arun Subramaniyan and Reetuparna Das},
  title           = {Parallel Automata Processor},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {45},
  number          = {2},
  pages           = {600--612},
  month           = may,
  year            = {2017},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/3140659.3080207},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Sep 15 11:09:14 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {Finite State Machines (FSM) are widely used
                    computation models for many application domains. These
                    embarrassingly sequential applications with irregular
                    memory access patterns perform poorly on conventional
                    von-Neumann architectures. The Micron Automata
                    Processor (AP) is an in-situ memory-based computational
                    architecture that accelerates non-deterministic finite
                    automata (NFA) processing in hardware. However, each
                    FSM on the AP is processed sequentially, limiting
                    potential speedups. In this paper, we explore the FSM
                    parallelization problem in the context of the AP.
                    Extending classical parallelization techniques to NFAs
                    executing on AP is non-trivial because of high
                    state-transition tracking overheads and exponential
                    computation complexity. We present the associated
                    challenges and propose solutions that leverage both the
                    unique properties of the NFAs (connected components,
                    input symbol ranges, convergence, common parent states)
                    and unique features in the AP (support for simultaneous
                    transitions, low-overhead flow switching, state vector
                    cache) to realize parallel NFA execution on the AP. We
                    evaluate our techniques against several important
                    benchmarks including NFAs used for network intrusion
                    detection, malware detection, text processing, protein
                    motif searching, DNA sequencing, and data analytics.
                    Our proposed parallelization scheme demonstrates
                    significant speedup (25.5x on average) compared to
                    sequential execution on AP. Prior work has already
                    shown that sequential execution on AP is at least an
                    order of magnitude better than GPUs, multi-core
                    processors and Xeon Phi accelerator.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@Article{Kateja:2017:VDB,
  author =       "Rajat Kateja and Anirudh Badam and Sriram Govindan and
                 Bikash Sharma and Greg Ganger",
  title =        "{Viyojit}: Decoupling Battery and {DRAM} Capacities
                 for Battery-Backed {DRAM}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "2",
  pages =        "613--626",
  month =        may,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3140659.3080236",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri Sep 15 11:09:14 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Non-Volatile Memories (NVMs) can significantly improve
                 the performance of data-intensive applications. A
                 popular form of NVM is Battery-backed DRAM, which is
                 available and in use today with DRAMs latency and
                 without the endurance problems of emerging NVM
                 technologies. Modern servers can be provisioned with
                 up-to 4 TB of DRAM, and provisioning battery backup to
                 write out such large memories is hard because of the
                 large battery sizes and the added hardware and cooling
                 costs. We present Viyojit, a system that exploits the
                 skew in write working sets of applications to provision
                 substantially smaller batteries while still ensuring
                 durability for the entire DRAM capacity. Viyojit
                 achieves this by bounding the number of dirty pages in
                 DRAM based on the provisioned battery capacity and
                 proactively writing out infrequently written pages to
                 an SSD. Even for write-heavy workloads with less skew
                 than we observe in analysis of real data center traces,
                 Viyojit reduces the required battery capacity to 11\%
                 of the original size, with a performance overhead of
                 7--25\%. Thus, Viyojit frees battery-backed DRAM from
                 stunted growth of battery capacities and enables
                 servers with terabytes of battery-backed DRAM.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

@Article{Young:2017:DCD,
  author          = {Vinson Young and Prashant J. Nair and Moinuddin K.
                    Qureshi},
  title           = {{DICE}: Compressing {DRAM} Caches for Bandwidth and
                    Capacity},
  journal         = j-COMP-ARCH-NEWS,
  volume          = {45},
  number          = {2},
  pages           = {627--638},
  month           = may,
  year            = {2017},
  CODEN           = {CANED2},
  DOI             = {https://doi.org/10.1145/3140659.3080243},
  ISSN            = {0163-5964 (print), 1943-5851 (electronic)},
  ISSN-L          = {0163-5964},
  bibdate         = {Fri Sep 15 11:09:14 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  abstract        = {This paper investigates compression for DRAM caches.
                    As the capacity of DRAM cache is typically large, prior
                    techniques on cache compression, which solely focus on
                    improving cache capacity, provide only a marginal
                    benefit. We show that more performance benefit can be
                    obtained if the compression of the DRAM cache is
                    tailored to provide higher bandwidth. If a DRAM cache
                    can provide two compressed lines in a single access,
                    and both lines are useful, the effective bandwidth of
                    the DRAM cache would double. Unfortunately, it is not
                    straight-forward to compress DRAM caches for bandwidth.
                    The typically used Traditional Set Indexing (TSI) maps
                    consecutive lines to consecutive sets, so the multiple
                    compressed lines obtained from the set are from
                    spatially distant locations and unlikely to be used
                    within a short period of each other. We can change the
                    indexing of the cache to place consecutive lines in the
                    same set to improve bandwidth; however, when the data
                    is incompressible, such spatial indexing reduces
                    effective capacity and causes significant slowdown.
                    Ideally, we would like to have spatial indexing when
                    the data is compressible and TSI otherwise. To this
                    end, we propose Dynamic-Indexing Cache comprEssion
                    (DICE), a dynamic design that can adapt between spatial
                    indexing and TSI, depending on the compressibility of
                    the data. We also propose low-cost Cache Index
                    Predictors (CIP) that can accurately predict the cache
                    indexing scheme on access in order to avoid probing
                    both indices for retrieving a given cache line. Our
                    studies with a 1GB DRAM cache, on a wide range of
                    workloads (including SPEC and Graph), show that DICE
                    improves performance by 19.0\% and reduces
                    energy-delay-product by 36\% on average. DICE is within
                    3\% of a design that has double the capacity and double
                    the bandwidth. DICE incurs a storage overhead of less
                    than 1KB and does not rely on any OS support.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-URL     = {https://dl.acm.org/loi/sigarch},
}

@article{Drumond:2017:MDE,
  author          = {Mario Drumond and Alexandros Daglis and Nooshin
                     Mirzadeh and Dmitrii Ustiugov and Javier Picorel and
                     Babak Falsafi and Boris Grot and Dionisios
                     Pnevmatikatos},
  title           = {The {Mondrian Data Engine}},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2017},
  month           = may,
  volume          = {45},
  number          = {2},
  pages           = {639--651},
  doi             = {https://doi.org/10.1145/3140659.3080233},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  abstract        = {The increasing demand for extracting value out of
                     ever-growing data poses an ongoing challenge to system
                     designers, a task only made trickier by the end of
                     Dennard scaling. As the performance density of
                     traditional CPU-centric architectures stagnates,
                     advancing compute capabilities necessitates novel
                     architectural approaches. Near-memory processing (NMP)
                     architectures are reemerging as promising candidates to
                     improve computing efficiency through tight coupling of
                     logic and memory. NMP architectures are especially
                     fitting for data analytics, as they provide immense
                     bandwidth to memory-resident data and dramatically
                     reduce data movement, the main source of energy
                     consumption. Modern data analytics operators are
                     optimized for CPU execution and hence rely on large
                     caches and employ random memory accesses. In the
                     context of NMP, such random accesses result in wasteful
                     DRAM row buffer activations that account for a
                     significant fraction of the total memory access energy.
                     In addition, utilizing NMP's ample bandwidth with
                     fine-grained random accesses requires complex hardware
                     that cannot be accommodated under NMP's tight area and
                     power constraints. Our thesis is that efficient NMP
                     calls for an algorithm-hardware co-design that favors
                     algorithms with sequential accesses to enable simple
                     hardware that accesses memory in streams. We introduce
                     an instance of such a co-designed NMP architecture for
                     data analytics, the Mondrian Data Engine. Compared to a
                     CPU-centric and a baseline NMP system, the Mondrian
                     Data Engine improves the performance of basic data
                     analytics operators by up to 49x and 5x, and efficiency
                     by up to 28x and 5x, respectively.},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri Sep 15 11:09:14 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Tsai:2017:JSD,
  author          = {Po-An Tsai and Nathan Beckmann and Daniel Sanchez},
  title           = {{Jenga}: Software-Defined Cache Hierarchies},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2017},
  month           = may,
  volume          = {45},
  number          = {2},
  pages           = {652--665},
  doi             = {https://doi.org/10.1145/3140659.3080214},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  abstract        = {Caches are traditionally organized as a rigid
                     hierarchy, with multiple levels of progressively larger
                     and slower memories. Hierarchy allows a simple, fixed
                     design to benefit a wide range of applications, since
                     working sets settle at the smallest (i.e., fastest and
                     most energy-efficient) level they fit in. However,
                     rigid hierarchies also add overheads, because each
                     level adds latency and energy even when it does not fit
                     the working set. These overheads are expensive on
                     emerging systems with heterogeneous memories, where the
                     differences in latency and energy across levels are
                     small. Significant gains are possible by specializing
                     the hierarchy to applications. We propose Jenga, a
                     reconfigurable cache hierarchy that dynamically and
                     transparently specializes itself to applications. Jenga
                     builds virtual cache hierarchies out of heterogeneous,
                     distributed cache banks using simple hardware
                     mechanisms and an OS runtime. In contrast to prior
                     techniques that trade energy and bandwidth for
                     performance (e.g., dynamic bypassing or prefetching),
                     Jenga eliminates accesses to unwanted cache levels.
                     Jenga thus improves both performance and energy
                     efficiency. On a 36-core chip with a 1 GB DRAM cache,
                     Jenga improves energy-delay product over a combination
                     of state-of-the-art techniques by 23\% on average and
                     by up to 85\%.},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri Sep 15 11:09:14 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib;
                     https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib},
  acknowledgement = ack-nhfb,
}

@article{Boyapati:2017:AND,
  author          = {Rahul Boyapati and Jiayi Huang and Pritam Majumder and
                     Ki Hwan Yum and Eun Jung Kim},
  title           = {{APPROX-NoC}: a Data Approximation Framework for
                     Network-On-Chip Architectures},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2017},
  month           = may,
  volume          = {45},
  number          = {2},
  pages           = {666--677},
  doi             = {https://doi.org/10.1145/3140659.3080241},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  abstract        = {The trend of unsustainable power consumption and large
                     memory bandwidth demands in massively parallel
                     multicore systems, with the advent of the big data era,
                     has brought upon the onset of alternate computation
                     paradigms utilizing heterogeneity, specialization,
                     processor-in-memory and approximation. Approximate
                     Computing is being touted as a viable solution for high
                     performance computation by relaxing the accuracy
                     constraints of applications. This trend has been
                     accentuated by emerging data intensive applications in
                     domains like image/video processing, machine learning
                     and big data analytics that allow inaccurate outputs
                     within an acceptable variance. Leveraging relaxed
                     accuracy for high throughput in Networks-on-Chip
                     (NoCs), which have rapidly become the accepted method
                     for connecting a large number of on-chip components,
                     has not yet been explored. We propose APPROX-NoC, a
                     hardware data approximation framework with an online
                     data error control mechanism for high performance NoCs.
                     APPROX-NoC facilitates approximate matching of data
                     patterns, within a controllable value range, to
                     compress them thereby reducing the volume of data
                     movement across the chip. Our evaluation shows that
                     APPROX-NoC achieves on average up to 9\% latency
                     reduction and 60\% throughput improvement compared with
                     state-of-the-art NoC data compression mechanisms, while
                     maintaining low application error. Additionally, with a
                     data intensive graph processing application we achieve
                     a 36.7\% latency reduction compared to state-of-the-art
                     compression mechanisms.},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri Sep 15 11:09:14 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Poremba:2017:TBA,
  author          = {Matthew Poremba and Itir Akgun and Jieming Yin and
                     Onur Kayiran and Yuan Xie and Gabriel H. Loh},
  title           = {There and Back Again: Optimizing the Interconnect in
                     Networks of Memory Cubes},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2017},
  month           = may,
  volume          = {45},
  number          = {2},
  pages           = {678--690},
  doi             = {https://doi.org/10.1145/3140659.3080251},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  abstract        = {High-performance computing, enterprise, and datacenter
                     servers are driving demands for higher total memory
                     capacity as well as memory performance. Memory
                     ``cubes'' with high per-package capacity (from 3D
                     integration) along with high-speed point-to-point
                     interconnects provide a scalable memory system
                     architecture with the potential to deliver both
                     capacity and performance. Multiple such cubes connected
                     together can form a ``Memory Network'' (MN), but the
                     design space for such MNs is quite vast, including
                     multiple topology types and multiple memory
                     technologies per memory cube. In this work, we first
                     analyze several MN topologies with different mixes of
                     memory package technologies to understand the key
                     tradeoffs and bottlenecks for such systems. We find
                     that most of a MN's performance challenges arise from
                     the interconnection network that binds the memory cubes
                     together. In particular, arbitration schemes used to
                     route through MNs, ratio of NVM to DRAM, and specific
                     topologies used have dramatic impact on performance and
                     energy results. Our initial analysis indicates that
                     introducing non-volatile memory to the MN presents a
                     unique tradeoff between memory array latency and
                     network latency. We observe that placing NVM cubes in a
                     specific order in the MN improves performance by
                     reducing the network size/diameter up to a certain NVM
                     to DRAM ratio. Novel MN topologies and arbitration
                     schemes also provide performance and energy deltas by
                     reducing the hop count of requests and response in the
                     MN. Based on our analyses, we introduce three
                     techniques to address MN latency issues: (1)
                     Distance-based arbitration scheme to improve queuing
                     latencies throughout the network, (2) skip-list
                     topology, derived from the classic data structure, to
                     improve network latency and link usage, and (3) the
                     MetaCube, a denser memory cube that leverages advanced
                     packaging technologies to improve latency by reducing
                     MN size.},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri Sep 15 11:09:14 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Fu:2017:FRR,
  author          = {Binzhang Fu and John Kim},
  title           = {{Footprint}: Regulating Routing Adaptiveness in
                     Networks-on-Chip},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2017},
  month           = may,
  volume          = {45},
  number          = {2},
  pages           = {691--702},
  doi             = {https://doi.org/10.1145/3140659.3080249},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  abstract        = {Routing algorithms can improve network performance by
                     maximizing routing adaptiveness but can be problematic
                     in the presence of endpoint congestion. Tree-saturation
                     is a well-known behavior caused by endpoint congestion.
                     Adaptive routing can, however, spread the congestion
                     and result in thick branches of the congestion tree ---
                     creating Head-of-Line (HoL) blocking and degrading
                     performance. In this work, we identify how ignoring
                     virtual channels (VCs) and their occupancy during
                     adaptive routing results in congestion trees with thick
                     branches as congestion is spread to all VCs. To address
                     this limitation, we propose Footprint routing algorithm
                     --- a new adaptive routing algorithm that minimizes the
                     size of the congestion tree, both in terms of the
                     number of nodes in the congestion tree as well as
                     branch thickness. Footprint achieves this by regulating
                     adaptiveness by requiring packets to follow the path of
                     prior packets to the same destination if the network is
                     congested instead of forking a new path or VC. Thus,
                     the congestion tree is dynamically kept as slim as
                     possible and reduces HoL blocking or congestion
                     spreading while maintaining high adaptivity and
                     maximizing VC buffer utilization. We evaluate the
                     proposed Footprint routing algorithm against other
                     adaptive routing algorithms and our simulation results
                     show that the network saturation throughput can be
                     improved by up to 43\% (58\%) compared with the fully
                     adaptive routing (partially adaptive routing)
                     algorithms.},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri Sep 15 11:09:14 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@article{Ebrahimi:2017:ENT,
  author          = {Masoumeh Ebrahimi and Masoud Daneshtalab},
  title           = {{EbDa}: a New Theory on Design and Verification of
                     Deadlock-free Interconnection Networks},
  journal         = j-COMP-ARCH-NEWS,
  year            = {2017},
  month           = may,
  volume          = {45},
  number          = {2},
  pages           = {703--715},
  doi             = {https://doi.org/10.1145/3140659.3080253},
  issn            = {0163-5964 (print), 1943-5851 (electronic)},
  issn-l          = {0163-5964},
  coden           = {CANED2},
  abstract        = {Freedom from deadlock is one of the most important
                     issues when designing routing algorithms in
                     on-chip/off-chip networks. Many works have been
                     developed upon Dally's theory proving that a network is
                     deadlock-free if there is no cyclic dependency on the
                     channel dependency graph. However, finding such acyclic
                     graph has been very challenging, which limits Dally's
                     theory to networks with a low number of channels. In
                     this paper, we introduce three theorems that directly
                     lead to routing algorithms with an acyclic channel
                     dependency graph. We also propose the partitioning
                     methodology, enabling a design to reach the maximum
                     adaptiveness for the n-dimensional mesh and k-ary
                     n-cube topologies with any given number of channels. In
                     addition, deadlock-free routing algorithms can be
                     derived ranging from maximally fully adaptive routing
                     down to deterministic routing. The proposed theorems
                     can drastically remove the difficulties of designing
                     deadlock-free routing algorithms.},
  fjournal        = {ACM SIGARCH Computer Architecture News},
  journal-url     = {https://dl.acm.org/loi/sigarch},
  bibdate         = {Fri Sep 15 11:09:14 MDT 2017},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

%%% ====================================================================
%%% Conference proceedings papers not included in regular issues:

@inproceedings{Lipovski:1998:RBN,
  author          = {Jack Lipovski},
  title           = {Retrospective: {Banyan} networks for partitioning
                     multiprocessor systems},
  year            = {1998},
  pages           = {1--1},
  crossref        = {ACM:1998:PAI},
  remark          = {25 years of the International Symposia on Computer
                     Architecture (selected papers).},
  bibdate         = {Fri May 12 17:56:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Dennis:1998:RPA,
  author          = {Jack B. Dennis},
  title           = {Retrospective: a preliminary architecture for a basic
                     data flow processor},
  year            = {1998},
  pages           = {2--4},
  crossref        = {ACM:1998:PAI},
  remark          = {25 years of the International Symposia on Computer
                     Architecture (selected papers).},
  bibdate         = {Fri May 12 17:56:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Patel:1998:RIT,
  author          = {Janak H. Patel},
  title           = {Retrospective: {Improving} the throughput of a
                     pipeline by insertion of delays},
  year            = {1998},
  pages           = {5--5},
  crossref        = {ACM:1998:PAI},
  remark          = {25 years of the International Symposia on Computer
                     Architecture (selected papers).},
  bibdate         = {Fri May 12 17:56:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Bell:1998:RWW,
  author =       "Gordon Bell and W. D. Strecker",
  title =        "Retrospective: {What} have we learned from the
                 {PDP-11} --- what we have learned from {VAX} and
                 {Alpha}",
  crossref =     "ACM:1998:PAI",
  pages =        "6--10",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@inproceedings{Shustek:1998:RIT,
  author          = {Leonard J. Shustek and Bernard L. Peuto},
  title           = {Retrospective: an instruction timing model of {CPU}
                     performance},
  year            = {1998},
  pages           = {11--12},
  crossref        = {ACM:1998:PAI},
  remark          = {25 years of the International Symposia on Computer
                     Architecture (selected papers).},
  bibdate         = {Fri May 12 17:56:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Ditzel:1998:RRH,
  author          = {David R. Ditzel and David A. Patterson},
  title           = {Retrospective: a retrospective on high-level language
                     computer architecture},
  year            = {1998},
  pages           = {13--14},
  crossref        = {ACM:1998:PAI},
  remark          = {25 years of the International Symposia on Computer
                     Architecture (selected papers).},
  bibdate         = {Fri May 12 17:56:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Batcher:1998:RAM,
  author          = {Ken Batcher},
  title           = {Retrospective: {Architecture} of a massively parallel
                     processor},
  year            = {1998},
  pages           = {15--16},
  crossref        = {ACM:1998:PAI},
  remark          = {25 years of the International Symposia on Computer
                     Architecture (selected papers).},
  bibdate         = {Fri May 12 17:56:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Pier:1998:RPH,
  author          = {Ken Pier},
  title           = {Retrospective: a processor for a high-performance
                     personal computer},
  year            = {1998},
  pages           = {17--19},
  crossref        = {ACM:1998:PAI},
  remark          = {25 years of the International Symposia on Computer
                     Architecture (selected papers).},
  bibdate         = {Fri May 12 17:56:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Kroft:1998:RLF,
  author          = {David Kroft},
  title           = {Retrospective: {Lockup}-free instruction fetch\slash
                     prefetch cache organization},
  year            = {1998},
  pages           = {20--21},
  crossref        = {ACM:1998:PAI},
  remark          = {25 years of the International Symposia on Computer
                     Architecture (selected papers).},
  bibdate         = {Fri May 12 17:56:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Smith:1998:RSB,
  author          = {James E. Smith},
  title           = {Retrospective: a study of branch prediction
                     strategies},
  year            = {1998},
  pages           = {22--23},
  crossref        = {ACM:1998:PAI},
  remark          = {25 years of the International Symposia on Computer
                     Architecture (selected papers).},
  bibdate         = {Fri May 12 17:56:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Patterson:1998:RRR,
  author          = {David A. Patterson and Carlo H. S{\'e}quin},
  title           = {Retrospective: {RISC I}: a {Reduced Instruction Set
                     Computer}},
  year            = {1998},
  pages           = {24--26},
  crossref        = {ACM:1998:PAI},
  note            = {This paper contains in column 1, page 25, the story of
                     the origin of the name ``RISC''.},
  remark          = {25 years of the International Symposia on Computer
                     Architecture (selected papers).},
  bibdate         = {Fri May 12 17:56:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Smith:1998:RDA,
  author          = {James E. Smith},
  title           = {Retrospective: {Decoupled} access\slash execute
                     architectures},
  year            = {1998},
  pages           = {27--28},
  crossref        = {ACM:1998:PAI},
  remark          = {25 years of the International Symposia on Computer
                     Architecture (selected papers).},
  bibdate         = {Fri May 12 17:56:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Gottlieb:1998:RPR,
  author          = {Allan Gottlieb},
  title           = {Retrospective: a personal retrospective on the {NYU}
                     ultracomputer},
  year            = {1998},
  pages           = {29--31},
  crossref        = {ACM:1998:PAI},
  remark          = {25 years of the International Symposia on Computer
                     Architecture (selected papers).},
  bibdate         = {Fri May 12 17:56:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Goodman:1998:RUC,
  author          = {James R. Goodman},
  title           = {Retrospective: {Using} cache memory to reduce
                     processor-memory traffic},
  year            = {1998},
  pages           = {32--33},
  crossref        = {ACM:1998:PAI},
  remark          = {25 years of the International Symposia on Computer
                     Architecture (selected papers).},
  bibdate         = {Fri May 12 17:56:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Fisher:1998:RVL,
  author          = {Joseph A. Fisher},
  title           = {Retrospective: {Very} long instruction word
                     architectures and the {ELI}-512},
  year            = {1998},
  pages           = {34--36},
  crossref        = {ACM:1998:PAI},
  remark          = {25 years of the International Symposia on Computer
                     Architecture (selected papers).},
  bibdate         = {Fri May 12 17:56:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Emer:1998:RCP,
  author          = {Joel S. Emer and Douglas W. Clark},
  title           = {Retrospective: {Characterization} of processor
                     performance in the {VAX-11\slash 780}},
  year            = {1998},
  pages           = {37--38},
  crossref        = {ACM:1998:PAI},
  remark          = {25 years of the International Symposia on Computer
                     Architecture (selected papers).},
  bibdate         = {Fri May 12 17:56:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Patel:1998:RLO,
  author          = {Janak H. Patel},
  title           = {Retrospective: a low-overhead coherence solution for
                     multiprocessors with private cache memories},
  year            = {1998},
  pages           = {39--41},
  crossref        = {ACM:1998:PAI},
  remark          = {25 years of the International Symposia on Computer
                     Architecture (selected papers).},
  bibdate         = {Fri May 12 17:56:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Smith:1998:RIP,
  author          = {James E. Smith},
  title           = {Retrospective: {Implementing} precise interrupts in
                     pipelined processors},
  year            = {1998},
  pages           = {42--42},
  crossref        = {ACM:1998:PAI},
  remark          = {25 years of the International Symposia on Computer
                     Architecture (selected papers).},
  bibdate         = {Fri May 12 17:56:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Hwu:1998:RHH,
  author          = {Wen-mei W. Hwu and Yale N. Patt},
  title           = {Retrospective: {HPSm}, a high performance restricted
                     data flow architecture having minimal functionality},
  year            = {1998},
  pages           = {43--44},
  crossref        = {ACM:1998:PAI},
  remark          = {25 years of the International Symposia on Computer
                     Architecture (selected papers).},
  bibdate         = {Fri May 12 17:56:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Gross:1998:RRW,
  author          = {Thomas Gross and Monica Lam},
  title           = {Retrospective: a retrospective on the {Warp}
                     machines},
  year            = {1998},
  pages           = {45--47},
  crossref        = {ACM:1998:PAI},
  remark          = {25 years of the International Symposia on Computer
                     Architecture (selected papers).},
  bibdate         = {Fri May 12 17:56:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Dubois:1998:RMA,
  author          = {Michel Dubois and Christoph Scheurich},
  title           = {Retrospective: {Memory} access buffering in
                     multiprocessors},
  year            = {1998},
  pages           = {48--50},
  crossref        = {ACM:1998:PAI},
  remark          = {25 years of the International Symposia on Computer
                     Architecture (selected papers).},
  bibdate         = {Fri May 12 17:56:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Sohi:1998:RII,
  author          = {Gurindar S. Sohi},
  title           = {Retrospective: {Instruction} issue logic for
                     high-performance, interruptible pipelined processors},
  year            = {1998},
  pages           = {51--53},
  crossref        = {ACM:1998:PAI},
  remark          = {25 years of the International Symposia on Computer
                     Architecture (selected papers).},
  bibdate         = {Fri May 12 17:56:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Dally:1998:RJM,
  author          = {William J. Dally and Andrew Chien and Stuart Fiske and
                     Waldemar Horwat and Richard Lethin and Michael Noakes
                     and Peter Nuth and Ellen Spertus and Deborah Wallach
                     and D. Scott Wills and Andrew Chang and John Keen},
  title           = {Retrospective: {The} {J}-machine},
  year            = {1998},
  pages           = {54--58},
  crossref        = {ACM:1998:PAI},
  remark          = {25 years of the International Symposia on Computer
                     Architecture (selected papers).},
  bibdate         = {Fri May 12 17:56:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Baer:1998:RIP,
  author          = {Jean-Loup Baer and Wen-Hann Wang},
  title           = {Retrospective: {On} the inclusion properties for
                     multi-level cache hierarchies},
  year            = {1998},
  pages           = {59--60},
  crossref        = {ACM:1998:PAI},
  remark          = {25 years of the International Symposia on Computer
                     Architecture (selected papers).},
  bibdate         = {Fri May 12 17:56:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Hennessy:1998:RED,
  author          = {John Hennessy},
  title           = {Retrospective: {Evaluation} of directory schemes for
                     cache coherence},
  year            = {1998},
  pages           = {61--62},
  crossref        = {ACM:1998:PAI},
  remark          = {25 years of the International Symposia on Computer
                     Architecture (selected papers).},
  bibdate         = {Fri May 12 17:56:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Adve:1998:RWO,
  author          = {Sarita V. Adve and Mark D. Hill},
  title           = {Retrospective: {Weak} ordering --- a new definition},
  year            = {1998},
  pages           = {63--66},
  crossref        = {ACM:1998:PAI},
  remark          = {25 years of the International Symposia on Computer
                     Architecture (selected papers).},
  bibdate         = {Fri May 12 17:56:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@inproceedings{Gharachorloo:1998:RMC,
  author          = {Kourosh Gharachorloo},
  title           = {Retrospective: {Memory} consistency and event ordering
                     in scalable shared-memory multiprocessors},
  year            = {1998},
  pages           = {67--70},
  crossref        = {ACM:1998:PAI},
  remark          = {25 years of the International Symposia on Computer
                     Architecture (selected papers).},
  bibdate         = {Fri May 12 17:56:30 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
}

@InProceedings{Jouppi:1998:RID,
  author =       {Norman P. Jouppi},
  title =        {Retrospective: {Improving} direct-mapped cache
                 performance by the addition of a small
                 fully-associative cache and prefetch buffers},
  crossref =     {ACM:1998:PAI},
  pages =        {71--73},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

%%% NOTE: the first author is Gregory M. Papadopoulos, spelled as in
%%% the companion entry Papadopoulos:1998:MET for the reprinted paper.
@InProceedings{Papadopoulos:1998:RME,
  author =       "Gregory M. Papadopoulos and David E. Culler",
  title =        "Retrospective: {Monsoon}: an explicit token-store
                 architecture",
  crossref =     "ACM:1998:PAI",
  pages =        "74--76",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Hwu:1998:RIA,
  author =       {Wen-mei W. Hwu},
  title =        {Retrospective: {Impact}: an architectural framework
                 for multiple-instruction issue},
  crossref =     {ACM:1998:PAI},
  pages =        {77--79},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Lenoski:1998:RDP,
  author =       {Daniel E. Lenoski and James P. Laudon},
  title =        {Retrospective: {The} {DASH} prototype: implementation
                 and performance},
  crossref =     {ACM:1998:PAI},
  pages =        {80--82},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{vonEicken:1998:RAM,
  author =       {Thorsten von Eicken and David E. Culler and Klaus Erik
                 Schauser and Seth Copen Goldstein},
  title =        {Retrospective: {Active} messages: a mechanism for
                 integrating computation and communication},
  crossref =     {ACM:1998:PAI},
  pages =        {83--84},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Ni:1998:RTM,
  author =       {Lionel Ni},
  title =        {Retrospective: {The} turn model for adaptive routing},
  crossref =     {ACM:1998:PAI},
  pages =        {85--86},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Yeh:1998:RAI,
  author =       {Tse-Yu Yeh and Yale N. Patt},
  title =        {Retrospective: {Alternative} implementations of
                 two-level adaptive training branch prediction},
  crossref =     {ACM:1998:PAI},
  pages =        {87--88},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Veidenbaum:1998:RCS,
  author =       {A. Veidenbaum and P.-C. Yew and D. J. Kuck and C. D.
                 Polychronopoulos and D. H. Padua and E. S. Davidson and
                 K. Gallivan},
  title =        {Retrospective: {The} {Cedar} system},
  crossref =     {ACM:1998:PAI},
  pages =        {89--91},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Blumrich:1998:RVM,
  author =       {Matthias A. Blumrich and Kai Li and Richard D. Alpert
                 and Cezary Dubnicki and Edward W. Felten and Jonathan
                 Sandberg},
  title =        {Retrospective: {Virtual} memory mapped network
                 interface for the {SHRIMP} multicomputer},
  crossref =     {ACM:1998:PAI},
  pages =        {92--94},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Kuskin:1998:RSF,
  author =       {Jeffrey S. Kuskin},
  title =        {Retrospective: {The} {Stanford FLASH} multiprocessor},
  crossref =     {ACM:1998:PAI},
  pages =        {95--97},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Reinhardt:1998:RTT,
  author =       {Steven K. Reinhardt and James R. Larus and David A.
                 Wood},
  title =        {Retrospective: {Tempest} and {Typhoon}: user-level
                 shared memory},
  crossref =     {ACM:1998:PAI},
  pages =        {98--102},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Agarwal:1998:RAM,
  author =       {Anant Agarwal},
  title =        {Retrospective: {The} {MIT Alewife} machine:
                 architecture and performance},
  crossref =     {ACM:1998:PAI},
  pages =        {103--110},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Sohi:1998:RMP,
  author =       {Gurindar Sohi},
  title =        {Retrospective: {Multiscalar} processors},
  crossref =     {ACM:1998:PAI},
  pages =        {111--114},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Tullsen:1998:RSM,
  author =       {Dean M. Tullsen and Susan J. Eggers and Henry M.
                 Levy},
  title =        {Retrospective: {Simultaneous} multithreading:
                 maximizing on-chip parallelism},
  crossref =     {ACM:1998:PAI},
  pages =        {115--116},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Goke:1998:BNP,
  author =       {L. Rodney Goke and G. J. Lipovski},
  title =        {{Banyan} networks for partitioning multiprocessor
                 systems},
  crossref =     {ACM:1998:PAI},
  pages =        {117--124},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Dennis:1998:PAB,
  author =       {Jack B. Dennis and David P. Misunas},
  title =        {A preliminary architecture for a basic data-flow
                 processor},
  crossref =     {ACM:1998:PAI},
  pages =        {125--131},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Patel:1998:ITP,
  author =       {Janak H. Patel and Edward S. Davidson},
  title =        {Improving the throughput of a pipeline by insertion of
                 delays},
  crossref =     {ACM:1998:PAI},
  pages =        {132--137},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Bell:1998:CSW,
  author =       {Gordon Bell and William D. Strecker},
  title =        {Computer structures: what have we learned from the
                 {PDP-11}?},
  crossref =     {ACM:1998:PAI},
  pages =        {138--151},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Peuto:1998:ITM,
  author =       {Bernard L. Peuto and Leonard J. Shustek},
  title =        {An instruction timing model of {CPU} performance},
  crossref =     {ACM:1998:PAI},
  pages =        {152--165},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Ditzel:1998:RHL,
  author =       {David R. Ditzel and David A. Patterson},
  title =        {Retrospective on high-level language computer
                 architecture},
  crossref =     {ACM:1998:PAI},
  pages =        {166--173},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Batcher:1998:AMP,
  author =       {Kenneth E. Batcher},
  title =        {Architecture of a massively parallel processor},
  crossref =     {ACM:1998:PAI},
  pages =        {174--179},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Lampson:1998:PHP,
  author =       {Butler W. Lampson and Kenneth A. Pier},
  title =        {A processor for a high-performance personal computer},
  crossref =     {ACM:1998:PAI},
  pages =        {180--194},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Kroft:1998:LFI,
  author =       {David Kroft},
  title =        {Lockup-free instruction fetch\slash prefetch cache
                 organization},
  crossref =     {ACM:1998:PAI},
  pages =        {195--201},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Smith:1998:SBP,
  author =       {James E. Smith},
  title =        {A study of branch prediction strategies},
  crossref =     {ACM:1998:PAI},
  pages =        {202--215},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

%%% NOTE: the second author's surname carries an acute accent; it is
%%% written as a braced accent group so that BibTeX treats the accented
%%% letter as a single character when sorting and labelling.
@InProceedings{Patterson:1998:RRI,
  author =       "David A. Patterson and Carlo H. S{\'e}quin",
  title =        "{RISC I}: a reduced instruction set {VLSI} computer",
  crossref =     "ACM:1998:PAI",
  pages =        "216--230",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Smith:1998:DAE,
  author =       {James E. Smith},
  title =        {Decoupled access\slash execute computer
                 architectures},
  crossref =     {ACM:1998:PAI},
  pages =        {231--238},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Gottlieb:1998:NUD,
  author =       {Allan Gottlieb and Ralph Grishman and Clyde P. Kruskal
                 and Kevin P. McAuliffe and Larry Rudolph and Marc
                 Snir},
  title =        {The {NYU Ultracomputer} --- designing a {MIMD},
                 shared-memory parallel machine},
  crossref =     {ACM:1998:PAI},
  pages =        {239--254},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Goodman:1998:UCM,
  author =       {James R. Goodman},
  title =        {Using cache memory to reduce processor-memory
                 traffic},
  crossref =     {ACM:1998:PAI},
  pages =        {255--262},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Fisher:1998:VLI,
  author =       {Joseph A. Fisher},
  title =        {Very long instruction word architectures and the
                 {ELI-512}},
  crossref =     {ACM:1998:PAI},
  pages =        {263--273},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Emer:1998:CPP,
  author =       {Joel S. Emer and Douglas W. Clark},
  title =        {A characterization of processor performance in the
                 {VAX-11\slash 780}},
  crossref =     {ACM:1998:PAI},
  pages =        {274--283},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Papamarcos:1998:LOC,
  author =       {Mark S. Papamarcos and Janak H. Patel},
  title =        {A low-overhead coherence solution for multiprocessors
                 with private cache memories},
  crossref =     {ACM:1998:PAI},
  pages =        {284--290},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Smith:1998:IPI,
  author =       {James E. Smith and Andrew R. Pleszkun},
  title =        {Implementation of precise interrupts in pipelined
                 processors},
  crossref =     {ACM:1998:PAI},
  pages =        {291--299},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

%%% NOTE: the first author is Wen-mei W. Hwu, spelled as in entries
%%% Hwu:1998:RIA and Chang:1998:IAF, so that the name sorts and indexes
%%% as one person.
@InProceedings{Hwu:1998:HHP,
  author =       "Wen-mei W. Hwu and Yale N. Patt",
  title =        "{HPSm}, a high performance restricted data flow
                 architecture having minimal functionality",
  crossref =     "ACM:1998:PAI",
  pages =        "300--308",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Annaratone:1998:WAI,
  author =       {Marco Annaratone and Emmanuel Arnould and Thomas Gross
                 and H. T. Kung and Monica S. Lam and Onat
                 Menzilcio{\u{g}}lu and Ken Sarocky and Jon A. Webb},
  title =        {{Warp} architecture and implementation},
  crossref =     {ACM:1998:PAI},
  pages =        {309--319},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Dubois:1998:MAB,
  author =       {Michel Dubois and Christoph Scheurich and Faye
                 Briggs},
  title =        {Memory access buffering in multiprocessors},
  crossref =     {ACM:1998:PAI},
  pages =        {320--328},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Sohi:1998:IIL,
  author =       {Gurindar S. Sohi and Sriram Vajapeyam},
  title =        {Instruction issue logic for high-performance,
                 interruptible pipelined processors},
  crossref =     {ACM:1998:PAI},
  pages =        {329--336},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Dally:1998:AMD,
  author =       {William J. Dally and Linda Chao and Andrew Chien and
                 Soha Hassoun and Waldemar Horwat and Jon Kaplan and
                 Paul Song and Brian Totty and Scott Wills},
  title =        {Architecture of a message-driven processor},
  crossref =     {ACM:1998:PAI},
  pages =        {337--344},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Baer:1998:IPM,
  author =       {Jean-Loup Baer and Wen-Hann Wang},
  title =        {On the inclusion properties for multi-level cache
                 hierarchies},
  crossref =     {ACM:1998:PAI},
  pages =        {345--352},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Agarwal:1998:EDS,
  author =       {Anant Agarwal and Richard Simoni and John Hennessy and
                 Mark Horowitz},
  title =        {An evaluation of directory schemes for cache
                 coherence},
  crossref =     {ACM:1998:PAI},
  pages =        {353--362},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Adve:1998:WON,
  author =       {Sarita V. Adve and Mark D. Hill},
  title =        {Weak ordering --- a new definition},
  crossref =     {ACM:1998:PAI},
  pages =        {363--375},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Gharachorloo:1998:MCE,
  author =       {Kourosh Gharachorloo and Daniel Lenoski and James
                 Laudon and Phillip Gibbons and Anoop Gupta and John
                 Hennessy},
  title =        {Memory consistency and event ordering in scalable
                 shared-memory multiprocessors},
  crossref =     {ACM:1998:PAI},
  pages =        {376--387},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

%%% NOTE: the title reads ``... fully-associative cache and prefetch
%%% buffers'', matching the retrospective entry Jouppi:1998:RID; the
%%% conjunction must not be dropped.
@InProceedings{Jouppi:1998:IDM,
  author =       "Norman P. Jouppi",
  title =        "Improving direct-mapped cache performance by the
                 addition of a small fully-associative cache and
                 prefetch buffers",
  crossref =     "ACM:1998:PAI",
  pages =        "388--397",
  year =         "1998",
  bibdate =      "Fri May 12 17:56:30 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  acknowledgement = ack-nhfb,
  remark =       "25 years of the International Symposia on Computer
                 Architecture (selected papers).",
}

@InProceedings{Papadopoulos:1998:MET,
  author =       {Gregory M. Papadopoulos and David E. Culler},
  title =        {{Monsoon}: an explicit token-store architecture},
  crossref =     {ACM:1998:PAI},
  pages =        {398--407},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Chang:1998:IAF,
  author =       {Pohua P. Chang and Scott A. Mahlke and William Y. Chen
                 and Nancy J. Warter and Wen-mei W. Hwu},
  title =        {{IMPACT}: an architectural framework for
                 multiple-instruction-issue processors},
  crossref =     {ACM:1998:PAI},
  pages =        {408--417},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Lenoski:1998:DPI,
  author =       {Daniel Lenoski and James Laudon and Truman Joe and
                 David Nakahira and Luis Stevens and Anoop Gupta and
                 John Hennessy},
  title =        {The {DASH} prototype: implementation and performance},
  crossref =     {ACM:1998:PAI},
  pages =        {418--429},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{vonEicken:1998:AMM,
  author =       {Thorsten von Eicken and David E. Culler and Seth Copen
                 Goldstein and Klaus Erik Schauser},
  title =        {Active messages: a mechanism for integrating
                 communication and computation},
  crossref =     {ACM:1998:PAI},
  pages =        {430--440},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Glass:1998:TMA,
  author =       {Christopher J. Glass and Lionel M. Ni},
  title =        {The turn model for adaptive routing},
  crossref =     {ACM:1998:PAI},
  pages =        {441--450},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Yeh:1998:AIT,
  author =       {Tse-Yu Yeh and Yale N. Patt},
  title =        {Alternative implementations of two-level adaptive
                 branch prediction},
  crossref =     {ACM:1998:PAI},
  pages =        {451--461},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Kuck:1998:CSI,
  author =       {D. Kuck and E. Davidson and D. Lawrie and A. Sameh and
                 C.-Q. Zhu and A. Veidenbaum and J. Konicek and P. Yew
                 and K. Gallivan and W. Jalby and H. Wijshoff and R.
                 Bramley and U. M. Yang and P. Emrath and D. Padua and
                 R. Eigenmann and J. Hoeflinger and G. Jayson and Z. Li
                 and T. Murphy and J. Andrews},
  title =        {The {Cedar} system and an initial performance study},
  crossref =     {ACM:1998:PAI},
  pages =        {462--472},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Blumrich:1998:VMM,
  author =       {Matthias A. Blumrich and Kai Li and Richard Alpert and
                 Cezary Dubnicki and Edward W. Felten and Jonathan
                 Sandberg},
  title =        {Virtual memory mapped network interface for the
                 {SHRIMP} multicomputer},
  crossref =     {ACM:1998:PAI},
  pages =        {473--484},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Kuskin:1998:SFM,
  author =       {Jeffrey Kuskin and David Ofelt and Mark Heinrich and
                 John Heinlein and Richard Simoni and K. Gharachorloo
                 and J. Chapin and D. Nakahira and J. Baxter and M.
                 Horowitz and A. Gupta and M. Rosenblum and J.
                 Hennessy},
  title =        {The {Stanford FLASH} multiprocessor},
  crossref =     {ACM:1998:PAI},
  pages =        {485--496},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Reinhardt:1998:TTU,
  author =       {Steven K. Reinhardt and James R. Larus and David A.
                 Wood},
  title =        {{Tempest} and {Typhoon}: user-level shared memory},
  crossref =     {ACM:1998:PAI},
  pages =        {497--508},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Agarwal:1998:AMA,
  author =       {Anant Agarwal and Ricardo Bianchini and David Chaiken
                 and Kirk L. Johnson and David Kranz and J. Kubiatowicz
                 and B.-H. Lim and K. Mackenzie and D. Yeung},
  title =        {The {MIT Alewife} machine: architecture and
                 performance},
  crossref =     {ACM:1998:PAI},
  pages =        {509--520},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Sohi:1998:MP,
  author =       {Gurindar S. Sohi and Scott E. Breach and T. N.
                 Vijaykumar},
  title =        {Multiscalar processors},
  crossref =     {ACM:1998:PAI},
  pages =        {521--532},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

@InProceedings{Tullsen:1998:SMM,
  author =       {Dean M. Tullsen and Susan J. Eggers and Henry M.
                 Levy},
  title =        {Simultaneous multithreading: maximizing on-chip
                 parallelism},
  crossref =     {ACM:1998:PAI},
  pages =        {533--544},
  year =         {1998},
  bibdate =      {Fri May 12 17:56:30 MDT 2006},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/sigarch.bib},
  acknowledgement = ack-nhfb,
  remark =       {25 years of the International Symposia on Computer
                 Architecture (selected papers).},
}

%%% ====================================================================
%%% Cross-referenced entries must come last:

%%% Crossref parent for the First Annual Symposium on Computer
%%% Architecture (1973).  title and booktitle hold the same text;
%%% classic BibTeX copies booktitle (not title) into entries that
%%% crossref this one.  "????" presumably marks a value not yet known,
%%% and volume appears to be in volume(issue) form -- confirm.
@Proceedings{Lipovski:1973:PFA,
  editor =       "G. Jack Lipovski and Stephen Anthony Szygenda",
  booktitle =    "{Proceedings of the First Annual Symposium on Computer
                 Architecture, December 9--11, 1973, University of
                 Florida, Gainesville, Florida}",
  title =        "{Proceedings of the First Annual Symposium on Computer
                 Architecture, December 9--11, 1973, University of
                 Florida, Gainesville, Florida}",
  volume =       "2(4)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "iv + 277",
  year =         "1973",
  CODEN =        "CANED2, CPAADU",
  ISBN =         "????",
  ISBN-13 =      "????",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE), 0149-7111",
  LCCN =         "TK7885.A1",
  bibdate =      "Fri May 12 14:36:31 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "IEEE catalog no. 73CH0824-3C.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=800123",
  acknowledgement = ack-nhfb,
}

%%% Crossref parent for the 2nd Annual Symposium on Computer
%%% Architecture (Houston, 1975).  title and booktitle are identical so
%%% that crossref children receive a booktitle.  ISBN and LCCN are
%%% "????" placeholders (presumably still unknown).
@Proceedings{King:1975:CPA,
  editor =       "Willis K. King",
  booktitle =    "{Conference Proceedings: 2nd Annual Symposium on
                 Computer Architecture, Houston, Texas, January 20--22,
                 1975}",
  title =        "{Conference Proceedings: 2nd Annual Symposium on
                 Computer Architecture, Houston, Texas, January 20--22,
                 1975}",
  volume =       "3(4)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "vi + 231",
  year =         "1975",
  CODEN =        "CANED2, CPAADU",
  ISBN =         "????",
  ISBN-13 =      "????",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE), 0149-7111",
  LCCN =         "????",
  bibdate =      "Fri May 12 14:27:32 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=642089",
  acknowledgement = ack-nhfb,
}

%%% Crossref parent for the 3rd Annual Symposium on Computer
%%% Architecture (Clearwater, 1976), with IEEE as corporate editor.
%%% volume, pages, ISBN and LCCN are all "?" placeholders here.
%%% NOTE(review): the note gives an IEEE number beginning "75" for a
%%% 1976 meeting -- confirm against the printed volume.
@Proceedings{IEEE:1976:CPA,
  editor =       "{IEEE}",
  booktitle =    "{Conference Proceedings: 3rd Annual Symposium on
                 Computer Architecture, Clearwater, Florida, January
                 19--21, 1976}",
  title =        "{Conference Proceedings: 3rd Annual Symposium on
                 Computer Architecture, Clearwater, Florida, January
                 19--21, 1976}",
  volume =       "??(??)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "????",
  year =         "1976",
  CODEN =        "CANED2, CPAADU",
  ISBN =         "????",
  ISBN-13 =      "????",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE), 0149-7111",
  LCCN =         "????",
  bibdate =      "Fri May 12 14:20:44 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "IEEE no. 75CH1043-5C.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=800110",
  acknowledgement = ack-nhfb,
}

%%% Crossref parent for the 4th Annual Symposium on Computer
%%% Architecture (Silver Spring, 1977), with IEEE as corporate editor.
%%% The volume is still the "??(??)" placeholder and ISBN is "????".
@Proceedings{IEEE:1977:CPA,
  editor =       "{IEEE}",
  booktitle =    "{Conference Proceedings: 4th Annual Symposium on
                 Computer Architecture, Silver Spring, Maryland, March
                 23--25, 1977}",
  title =        "{Conference Proceedings: 4th Annual Symposium on
                 Computer Architecture, Silver Spring, Maryland, March
                 23--25, 1977}",
  volume =       "??(??)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "ix + 438",
  year =         "1977",
  CODEN =        "CANED2, CPAADU",
  ISBN =         "????",
  ISBN-13 =      "????",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE), 0149-7111",
  LCCN =         "QA76.9.A73 S97 1977",
  bibdate =      "Fri May 12 14:22:57 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "IEEE no. 77 CH1182-5C.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=800255",
  acknowledgement = ack-nhfb,
}

%%% Crossref parent recorded as the 5th Annual Symposium on Computer
%%% Architecture (Palo Alto), year 1979, with IEEE as corporate editor.
%%% pages, ISBN and LCCN are "????" placeholders.
%%% NOTE(review): the preceding parent is the 4th symposium (1977) and
%%% the following one is the 7th (1980), so no 6th-symposium entry is
%%% present; verify that the ordinal, place, dates and year recorded
%%% here all belong to the same meeting.
@Proceedings{IEEE:1979:CPA,
  editor =       "{IEEE}",
  booktitle =    "{Conference Proceedings: 5th Annual Symposium on
                 Computer Architecture, Palo Alto, California, April
                 23--25, 1979}",
  title =        "{Conference Proceedings: 5th Annual Symposium on
                 Computer Architecture, Palo Alto, California, April
                 23--25, 1979}",
  volume =       "6(7)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "????",
  year =         "1979",
  CODEN =        "CANED2, CPAADU",
  ISBN =         "????",
  ISBN-13 =      "????",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE), 0149-7111",
  LCCN =         "????",
  bibdate =      "Fri May 12 14:22:57 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=800094",
  acknowledgement = ack-nhfb,
}

%%% Crossref parent for the 7th Annual Symposium on Computer
%%% Architecture (La Baule, 1980), with ACM as corporate editor and
%%% publisher.  bibsource lists three origins, the acknowledgement is
%%% ack-nj rather than ack-nhfb, and the entry has no LCCN field.
@Proceedings{ACM:1980:CPA,
  editor =       "{ACM}",
  booktitle =    "{Conference Proceedings: 7th Annual Symposium on
                 Computer Architecture, La Baule, France, 6--8 May
                 1980}",
  title =        "{Conference Proceedings: 7th Annual Symposium on
                 Computer Architecture, La Baule, France, 6--8 May
                 1980}",
  volume =       "8(3)",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "333",
  year =         "1980",
  CODEN =        "CANED2, CPAADU",
  ISBN =         "????",
  ISBN-13 =      "????",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE), 0149-7111",
  bibdate =      "Fri Sep 16 10:53:10 1994",
  bibsource =    "ftp://ftp.math.utah.edu/pub/mirrors/ftp.ira.uka.de/bibliography/Math/fparith.bib;
                 http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=800090",
  acknowledgement = ack-nj,
}

%%% Crossref parent for the 8th Annual Symposium on Computer
%%% Architecture (Minneapolis, 1981), with IEEE as corporate editor.
%%% pages, ISBN and LCCN are "????" placeholders.  The volume(issue)
%%% value was supplied during review; it lies between the neighbouring
%%% 8(3) of 1980 and 10(3) of 1982 -- confirm against the ACM listing.
@Proceedings{IEEE:1981:CPA,
  editor =       "{IEEE}",
  booktitle =    "{Conference Proceedings: 8th Annual Symposium on
                 Computer Architecture, Minneapolis, Minnesota, May
                 12--14, 1981}",
  title =        "{Conference Proceedings: 8th Annual Symposium on
                 Computer Architecture, Minneapolis, Minnesota, May
                 12--14, 1981}",
  volume =       "9(3)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "????",
  year =         "1981",
  CODEN =        "CANED2, CPAADU",
  ISBN =         "????",
  ISBN-13 =      "????",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE), 0149-7111",
  LCCN =         "????",
  bibdate =      "Fri May 12 14:25:51 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=800052",
  acknowledgement = ack-nhfb,
}

%%% Crossref parent for the 9th annual Symposium on Computer
%%% Architecture (Austin, 1982), with IEEE as corporate editor.  The
%%% note records the ACM and IEEE order/catalogue numbers; ISBN is
%%% still the "????" placeholder.
@Proceedings{IEEE:1982:CPA,
  editor =       "{IEEE}",
  booktitle =    "{Conference proceedings: the 9th annual Symposium on
                 Computer Architecture: April 26--29, 1982, Austin,
                 Texas}",
  title =        "{Conference proceedings: the 9th annual Symposium on
                 Computer Architecture: April 26--29, 1982, Austin,
                 Texas}",
  volume =       "10(3)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "viii + 335",
  year =         "1982",
  CODEN =        "CANED2, CPAADU",
  ISBN =         "????",
  ISBN-13 =      "????",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE), 0149-7111",
  LCCN =         "QA76.9.A73 S97 1982",
  bibdate =      "Fri May 12 14:17:17 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ACM order no. 415820. IEEE catalogue no. 82CH1754-1.
                 IEEE Computer Society order no. 411.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=800048",
  acknowledgement = ack-nhfb,
}

%%% Crossref parent for the 10th annual International Symposium on
%%% Computer Architecture (Stockholm, 1983), with IEEE as corporate
%%% editor.  title and booktitle are brace-protected, like the other
%%% proceedings parents, so that case-changing styles keep the proper
%%% nouns (Royal Institute of Technology, Stockholm, Sweden) intact.
@Proceedings{IEEE:1983:CPA,
  editor =       "{IEEE}",
  booktitle =    "{Conference proceedings: the 10th annual International
                 Symposium on Computer Architecture, Royal Institute of
                 Technology, Stockholm, Sweden}",
  title =        "{Conference proceedings: the 10th annual International
                 Symposium on Computer Architecture, Royal Institute of
                 Technology, Stockholm, Sweden}",
  volume =       "11(3)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "ix + 438",
  year =         "1983",
  CODEN =        "CANED2",
  ISBN =         "0-89791-101-6",
  ISBN-13 =      "978-0-89791-101-6",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9.A73 .S97 1983",
  bibdate =      "Fri May 12 13:53:44 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ACM order number 415830. IEEE catalog no. 83CH1889-5.
                 IEEE Computer Society order no. 473.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=800046",
  acknowledgement = ack-nhfb,
}

%%% Crossref parent for the 11th Annual International Symposium on
%%% Computer Architecture (Ann Arbor, 1984), with IEEE as corporate
%%% editor.  Only the paperback ISBN is recorded; the note carries the
%%% ACM and IEEE order/catalog numbers.
@Proceedings{IEEE:1984:AIS,
  editor =       "{IEEE}",
  booktitle =    "{The 11th Annual International Symposium on Computer
                 Architecture, June 5--7, 1984, Ann Arbor, Michigan
                 conference proceedings}",
  title =        "{The 11th Annual International Symposium on Computer
                 Architecture, June 5--7, 1984, Ann Arbor, Michigan
                 conference proceedings}",
  volume =       "12(3)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "ix + 373",
  year =         "1984",
  CODEN =        "CANED2",
  ISBN =         "0-8186-0538-3 (paperback)",
  ISBN-13 =      "978-0-8186-0538-3 (paperback)",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9.A73 S97 1984",
  bibdate =      "Fri May 12 14:30:24 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ACM order no. 415840. IEEE catalog no. 84CH2051-1.
                 IEEE Computer Society no. 538.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=800015",
  acknowledgement = ack-nhfb,
}

%%% Crossref parent for the 12th Annual International Symposium on
%%% Computer Architecture (Boston, 1985), with IEEE as corporate
%%% editor.  The note carries the ACM and IEEE order/catalog numbers.
@Proceedings{IEEE:1985:AIS,
  editor =       "{IEEE}",
  booktitle =    "{The 12th Annual International Symposium on Computer
                 Architecture, June 17--19, 1985, Boston, Massachusetts:
                 conference proceedings}",
  title =        "{The 12th Annual International Symposium on Computer
                 Architecture, June 17--19, 1985, Boston, Massachusetts:
                 conference proceedings}",
  volume =       "13(3)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xiv + 428",
  year =         "1985",
  CODEN =        "CANED2",
  ISBN =         "0-8186-0634-7",
  ISBN-13 =      "978-0-8186-0634-2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9.A73 C65",
  bibdate =      "Fri May 12 13:47:45 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ACM order no. 415850. IEEE catalog no. 85CH2144-4.
                 IEEE Computer Society order no. 634.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=327010",
  acknowledgement = ack-nhfb,
}

%%% Crossref parent for the thirteenth annual International Symposium
%%% on Computer Architecture (Tokyo, 1986), with IEEE as corporate
%%% editor.  The titles are brace-protected, so their capitalization is
%%% printed verbatim; the conference name is therefore capitalized the
%%% same way as in the other proceedings parents.
%%% NOTE(review): the IEEE catalogue number in the note has one more
%%% digit than those of neighbouring years -- confirm.
@Proceedings{IEEE:1986:CPT,
  editor =       "{IEEE}",
  booktitle =    "{Conference proceedings: the thirteenth annual
                 International Symposium on Computer Architecture, June
                 2--5, 1986, Tokyo, Japan}",
  title =        "{Conference proceedings: the thirteenth annual
                 International Symposium on Computer Architecture, June
                 2--5, 1986, Tokyo, Japan}",
  volume =       "14(2)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xiii + 454",
  year =         "1986",
  CODEN =        "CANED2",
  ISBN =         "0-8186-8719-3",
  ISBN-13 =      "978-0-8186-8719-8",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9.A73 I56 1986",
  bibdate =      "Fri May 12 13:51:08 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ACM order number 415860. IEEE catalogue number
                 86CH12291-3. IEEE Computer Society order number 719.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=17407",
  acknowledgement = ack-nhfb,
}

%%% Crossref parent for the 14th Annual International Symposium on
%%% Computer Architecture (Pittsburgh, 1987), with IEEE as corporate
%%% editor.  Each ISBN is listed once with a single binding label; the
%%% labels follow the same numbering pattern as the 1988 proceedings
%%% (0xxx paperback, 4xxx microfiche, 8xxx casebound).
@Proceedings{IEEE:1987:AIS,
  editor =       "{IEEE}",
  booktitle =    "{The 14th Annual International Symposium on Computer
                 Architecture, June 2--5, 1987, Pittsburgh,
                 Pennsylvania: Conference proceedings}",
  title =        "{The 14th Annual International Symposium on Computer
                 Architecture, June 2--5, 1987, Pittsburgh,
                 Pennsylvania: Conference proceedings}",
  volume =       "15(2)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xi + 321",
  year =         "1987",
  CODEN =        "CANED2",
  ISBN =         "0-8186-0776-9 (paperback), 0-8186-4776-0 (microfiche),
                 0-8186-8776-2 (casebound)",
  ISBN-13 =      "978-0-8186-0776-9 (paperback), 978-0-8186-4776-5
                 (microfiche), 978-0-8186-8776-1 (casebound)",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9.A73 I56 1987",
  bibdate =      "Fri May 12 14:07:52 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ACM Order No. 415870.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=30350",
  acknowledgement = ack-nhfb,
}

%%% Crossref parent for the 15th Annual International Symposium on
%%% Computer Architecture (Honolulu, 1988), with IEEE as corporate
%%% editor.  Three ISBNs are recorded, one per binding (paperback,
%%% microfiche, case).
@Proceedings{IEEE:1988:AIS,
  editor =       "{IEEE}",
  booktitle =    "{The 15th Annual International Symposium on Computer
                 Architecture: Conference proceedings, May 30--June 2,
                 1988, Honolulu, Hawaii}",
  title =        "{The 15th Annual International Symposium on Computer
                 Architecture: Conference proceedings, May 30--June 2,
                 1988, Honolulu, Hawaii}",
  volume =       "16(2)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xi + 461",
  year =         "1988",
  CODEN =        "CANED2",
  ISBN =         "0-8186-0861-7 (paperback), 0-8186-4861-9 (microfiche),
                 0-8186-8861-0 (case)",
  ISBN-13 =      "978-0-8186-0861-2 (paperback), 978-0-8186-4861-8
                 (microfiche), 978-0-8186-8861-4 (case)",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9.A73 C65",
  bibdate =      "Fri May 12 14:09:39 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ACM order no. 415880.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=52400",
  acknowledgement = ack-nhfb,
}

%%% Crossref parent for the 16th annual International Symposium on
%%% Computer Architecture (Jerusalem, 1989), with ACM as corporate
%%% editor.  ISBN-13 uses the same comma-separated list form as ISBN,
%%% and ISSN/ISSN-L use the labelled form found in the neighbouring
%%% proceedings parents.
@Proceedings{ACM:1989:PAI,
  editor =       "{ACM}",
  booktitle =    "{Proceedings of the 16th annual International
                 Symposium on Computer Architecture, May 28--June 1,
                 1989, Jerusalem, Israel}",
  title =        "{Proceedings of the 16th annual International
                 Symposium on Computer Architecture, May 28--June 1,
                 1989, Jerusalem, Israel}",
  volume =       "17(3)",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "xvii + 426",
  year =         "1989",
  CODEN =        "CANED2",
  ISBN =         "0-89791-319-1, 0-8186-5948-3 (microfiche),
                 0-8186-8948-X (casebound), 0-8186-1948-1 (paperback)",
  ISBN-13 =      "978-0-89791-319-5, 978-0-8186-5948-5 (microfiche),
                 978-0-8186-8948-2 (casebound), 978-0-8186-1948-9
                 (paperback)",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9.A73 C65",
  bibdate =      "Fri May 12 13:42:34 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ACM order number 415890. IEEE catalog number
                 89CH2705-2. IEEE Computer Society order number 1948.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=74925",
  acknowledgement = ack-nhfb,
  remark =       "ISCA '89 Proceedings",
}

%%% Crossref parent for the 17th annual International Symposium on
%%% Computer Architecture (Seattle, 1990), with IEEE as corporate
%%% editor.  Four ISBNs are recorded (one unlabelled, plus casebound,
%%% paperback and microfiche).  The remark tags the volume by symposium
%%% ordinal ('17), not by year.
@Proceedings{IEEE:1990:PAI,
  editor =       "{IEEE}",
  booktitle =    "{Proceedings: the 17th annual International Symposium
                 on Computer Architecture, May 28--31, 1990, Seattle,
                 Washington}",
  title =        "{Proceedings: the 17th annual International Symposium
                 on Computer Architecture, May 28--31, 1990, Seattle,
                 Washington}",
  volume =       "18(2)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xv + 378",
  year =         "1990",
  CODEN =        "CANED2",
  ISBN =         "0-8186-9047-X (casebound), 0-89791-366-3,
                 0-8186-2047-1 (paperback), 0-8186-6047-3 (microfiche)",
  ISBN-13 =      "978-0-8186-9047-1 (casebound), 978-0-89791-366-9,
                 978-0-8186-2047-8 (paperback), 978-0-8186-6047-4
                 (microfiche)",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9.A73 I56 1990",
  bibdate =      "Fri May 12 14:04:34 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ACM order no. 415900.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=325164",
  acknowledgement = ack-nhfb,
  remark =       "ISCA '17 Proceedings",
}

%%% Crossref parent for the 18th International Symposium on Computer
%%% Architecture (Toronto, 1991), with ACM as corporate editor.  The
%%% LCCN is written with the same QA76.9.A73 class prefix as the 1990
%%% and 1992 parents.  This entry has no bibdate field.
@Proceedings{ACM:1991:PIS,
  editor =       "{ACM}",
  booktitle =    "{Proceedings of the 18th International Symposium on
                 Computer Architecture: May 27--30, 1991, Toronto,
                 Canada}",
  title =        "{Proceedings of the 18th International Symposium on
                 Computer Architecture: May 27--30, 1991, Toronto,
                 Canada}",
  volume =       "19(3)",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "xv + 399",
  year =         "1991",
  CODEN =        "CANED2",
  ISBN =         "0-89791-394-9",
  ISBN-13 =      "978-0-89791-394-2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9.A73 I56 1991",
  bibsource =    "ftp://ftp.math.utah.edu/pub/mirrors/ftp.ira.uka.de/bibliography/Os/IMMD_IV.bib;
                 http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ACM order number 415910. IEEE catalog number
                 91CH2995-9. IEEE Computer Society order number 2146.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=115952",
  acknowledgement = ack-nhfb,
  remark =       "ISCA '18 Proceedings",
}

%%% Crossref parent for the 19th annual International Symposium on
%%% Computer Architecture (Gold Coast, 1992), with IEEE as corporate
%%% editor.  Four ISBNs are recorded, one per binding.  The remark tags
%%% the volume by symposium ordinal ('19), not by year.
@Proceedings{IEEE:1992:PAI,
  editor =       "{IEEE}",
  booktitle =    "{Proceedings, the 19th annual International Symposium
                 on Computer Architecture: May 19--21, 1992, Gold Coast,
                 Queensland, Australia}",
  title =        "{Proceedings, the 19th annual International Symposium
                 on Computer Architecture: May 19--21, 1992, Gold Coast,
                 Queensland, Australia}",
  volume =       "20(2)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xvi + 439",
  year =         "1992",
  CODEN =        "CANED2",
  ISBN =         "0-89791-509-7 (soft cover), 0-8186-2940-1 (perfect
                 bound), 0-8186-2942-8 (casebound), 0-8186-2941-X
                 (microfiche)",
  ISBN-13 =      "978-0-89791-509-0 (soft cover), 978-0-8186-2940-2
                 (perfect bound), 978-0-8186-2942-6 (casebound),
                 978-0-8186-2941-9 (microfiche)",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9.A73 I56 1992",
  bibdate =      "Fri May 12 13:59:17 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ACM order number 415920. IEEE catalog number
                 92CH3156-7. IEEE Computer Society order number 2940.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=139669",
  acknowledgement = ack-nhfb,
  remark =       "ISCA '19 Proceedings",
}

%%% Crossref parent for the 20th Annual International Symposium on
%%% Computer Architecture (San Diego, 1993), with ACM as corporate
%%% editor.  Unlike its neighbours this entry also carries journal,
%%% month, fjournal, journal-URL and confsponsor fields; journal and
%%% series both use the same j-COMP-ARCH-NEWS macro.
@Proceedings{ACM:1993:AIS,
  editor =       "{ACM}",
  booktitle =    "{20th Annual International Symposium on Computer
                 Architecture ISCA '20, San Diego, CA, USA, May 16--19,
                 1993}",
  title =        "{20th Annual International Symposium on Computer
                 Architecture ISCA '20, San Diego, CA, USA, May 16--19,
                 1993}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "21(2)",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "xii + 361",
  month =        may,
  year =         "1993",
  CODEN =        "CANED2",
  ISBN =         "0-8186-3810-9 (paper), 0-8186-3811-7 (microfiche),
                 0-8186-3812-5 (case)",
  ISBN-13 =      "978-0-8186-3810-7 (paper), 978-0-8186-3811-4
                 (microfiche), 978-0-8186-3812-1 (case)",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9.A73 I58 1993",
  bibdate =      "Sat Sep 28 19:27:02 MDT 1996",
  bibsource =    "ftp://ftp.math.utah.edu/pub/tex/bib/mach.bib;
                 http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ACM order number 415930. IEEE catalog number
                 93CH3284-7. IEEE Computer Society Press order number
                 3810-02.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=165123",
  acknowledgement = ack-nhfb,
  confsponsor =  "IEEE; ACM",
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "https://dl.acm.org/loi/sigarch",
}

%%% Crossref parent for the 21st Annual International Symposium on
%%% Computer Architecture (Chicago, 1994), with IEEE as corporate
%%% editor.  Three ISBNs are recorded, one per binding; there is no
%%% note field with order numbers in this entry.
@Proceedings{IEEE:1994:PAI,
  editor =       "{IEEE}",
  booktitle =    "{Proceedings: the 21st Annual International Symposium
                 on Computer Architecture, April 18--21, 1994, Chicago,
                 Illinois}",
  title =        "{Proceedings: the 21st Annual International Symposium
                 on Computer Architecture, April 18--21, 1994, Chicago,
                 Illinois}",
  volume =       "22(2)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xii + 394",
  year =         "1994",
  CODEN =        "CANED2",
  ISBN =         "0-8186-5510-0 (paper), 0-8186-5511-9 (microfiche),
                 0-8186-5512-7 (casebound)",
  ISBN-13 =      "978-0-8186-5510-4 (paper), 978-0-8186-5511-1
                 (microfiche), 978-0-8186-5512-8 (casebound)",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9.A73 S97 1994",
  bibdate =      "Fri May 12 13:45:19 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=191995",
  acknowledgement = ack-nhfb,
  remark =       "ISCA '21 Proceedings",
}

%%% Crossref parent for the 22nd Annual International Symposium on
%%% Computer Architecture (Santa Margherita Ligure, 1995), with ACM as
%%% corporate editor.  The note lists the ACM order number, the IEEE
%%% catalog number and the IEEE Computer Society order number.
@Proceedings{ACM:1995:PAI,
  editor =       "{ACM}",
  booktitle =    "{Proceedings, the 22nd Annual International Symposium
                 on Computer Architecture: June 22--24, 1995, Santa
                 Margherita Ligure, Italy}",
  title =        "{Proceedings, the 22nd Annual International Symposium
                 on Computer Architecture: June 22--24, 1995, Santa
                 Margherita Ligure, Italy}",
  volume =       "23(2)",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "xiii + 426",
  year =         "1995",
  CODEN =        "CANED2",
  ISBN =         "0-89791-698-0",
  ISBN-13 =      "978-0-89791-698-1",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9.A73 I56 1995",
  bibdate =      "Fri May 12 13:37:23 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "ACM order number 415950. IEEE catalog number
                 95CS35801. IEEE Computer Society order number
                 PRO7677.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=223982",
  acknowledgement = ack-nhfb,
  remark =       "ISCA '22",
}

%%% Crossref parent for the 23rd Annual International Symposium on
%%% Computer Architecture (Philadelphia, 1996), with ACM as corporate
%%% editor.  series uses the shared j-COMP-ARCH-NEWS macro, and the
%%% journal identifiers (CODEN, ISSN, ISSN-L) repeat the values used by
%%% the other proceedings parents of this series.
@Proceedings{ACM:1996:PAI,
  editor =       "{ACM}",
  booktitle =    "{Proceedings: the 23rd Annual International Symposium
                 on Computer Architecture, May 22--24, 1996,
                 Philadelphia, Pennsylvania}",
  title =        "{Proceedings: the 23rd Annual International Symposium
                 on Computer Architecture, May 22--24, 1996,
                 Philadelphia, Pennsylvania}",
  volume =       "24(2)",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "xii + 318",
  year =         "1996",
  CODEN =        "CANED2",
  ISBN =         "0-89791-786-3",
  ISBN-13 =      "978-0-89791-786-5",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9.A73 S97 1996",
  bibdate =      "Fri May 12 12:36:04 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib;
                 z3950.bibsys.no:2100/BIBSYS",
  note =         "ACM order number 415960.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=232973",
  acknowledgement = ack-nhfb,
  remark =       "ISCA '23 proceedings; FCRC '96.",
}

%%% Crossref parent for the 24th Annual International Symposium on
%%% Computer Architecture (Denver, 1997), with ACM as corporate editor.
%%% bibsource records two origins (this file and a BIBSYS Z39.50
%%% server); there is no remark field.
@Proceedings{ACM:1997:AIS,
  editor =       "{ACM}",
  booktitle =    "{The 24th Annual International Symposium on Computer
                 Architecture, June 2--4, 1997, Denver, Colorado:
                 conference proceedings}",
  title =        "{The 24th Annual International Symposium on Computer
                 Architecture, June 2--4, 1997, Denver, Colorado:
                 conference proceedings}",
  volume =       "25(2)",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "vii + 350",
  year =         "1997",
  CODEN =        "CANED2",
  ISBN =         "0-89791-901-7",
  ISBN-13 =      "978-0-89791-901-2",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9.A73 S94 1997",
  bibdate =      "Fri May 12 12:36:26 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib;
                 z3950.bibsys.no:2100/BIBSYS",
  note =         "ACM order number 415974.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=264107",
  acknowledgement = ack-nhfb,
}

%%% Crossref parent for the 25th Annual International Symposium on
%%% Computer Architecture (Barcelona, 1998), with ACM as corporate
%%% editor; the "selected papers" reprint entries also crossref it and
%%% the URL field lists two ACM tables of contents.  series uses the
%%% shared j-COMP-ARCH-NEWS macro, and CODEN/ISSN/ISSN-L repeat the
%%% values used by the other proceedings parents of this series.
%%% NOTE(review): the ACM order number starts with 414, while
%%% neighbouring years start with 415 -- confirm.
@Proceedings{ACM:1998:PAI,
  editor =       "{ACM}",
  booktitle =    "{Proceedings: the 25th Annual International Symposium
                 on Computer Architecture, June 27--July 1, 1998,
                 Barcelona, Spain}",
  title =        "{Proceedings: the 25th Annual International Symposium
                 on Computer Architecture, June 27--July 1, 1998,
                 Barcelona, Spain}",
  volume =       "26(3)",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "xiii + 394",
  year =         "1998",
  CODEN =        "CANED2",
  ISBN =         "0-8186-8491-7, 0-8186-8492-5, 0-8186-8493-3",
  ISBN-13 =      "978-0-8186-8491-3, 978-0-8186-8492-0,
                 978-0-8186-8493-7",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9.A73 S97 1998",
  bibdate =      "Fri May 12 12:36:10 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib;
                 z3950.bibsys.no:2100/BIBSYS",
  note =         "ACM Order Number 414984. IEEE Computer Society Order
                 Number PR08491; IEEE Order Plan Catalog Number
                 98CB36235.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=279358;
                 http://portal.acm.org/toc.cfm?id=285930",
  acknowledgement = ack-nhfb,
  remark =       "ISCA '25 proceedings.",
}

%%% Crossref parent for the 26th International Symposium on Computer
%%% Architecture (Atlanta, 1999), with IEEE as corporate editor.  The
%%% volume(issue) value was supplied during review; it lies between the
%%% 26(3) of 1998 and the 28(2) of 2000 -- confirm against the ACM
%%% listing.
%%% NOTE(review): the LCCN has a period after "A73" that the other
%%% entries lack, and the IEEE catalog number in the note starts with
%%% "98" for a 1999 meeting -- confirm both.
@Proceedings{IEEE:1999:PIS,
  editor =       "{IEEE}",
  booktitle =    "{Proceedings of the 26th International Symposium on
                 Computer Architecture: May 2--4, 1999, Atlanta,
                 Georgia}",
  title =        "{Proceedings of the 26th International Symposium on
                 Computer Architecture: May 2--4, 1999, Atlanta,
                 Georgia}",
  volume =       "27(2)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xii + 317",
  year =         "1999",
  CODEN =        "CANED2",
  ISBN =         "0-7695-0170-2, 0-7695-0171-0 (casebound)",
  ISBN-13 =      "978-0-7695-0170-3, 978-0-7695-0171-0 (casebound)",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9.A73. S9 1999",
  bibdate =      "Fri May 12 13:33:37 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "IEEE Computer Society Order Number PR00170. IEEE Order
                 Plan Catalog Number 98CB36367.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=300979",
  acknowledgement = ack-nhfb,
  remark =       "ISCA '99 proceedings",
}

%%% Crossref parent for the 27th International Symposium on Computer
%%% Architecture (Vancouver, 2000), with ACM as corporate editor.
%%% series uses the shared j-COMP-ARCH-NEWS macro, and the journal
%%% identifiers (CODEN, ISSN, ISSN-L) repeat the values used by the
%%% other proceedings parents of this series.
@Proceedings{ACM:2000:PIS,
  editor =       "{ACM}",
  booktitle =    "{Proceedings of the 27th International Symposium on
                 Computer Architecture, June 12--14, 2000, Vancouver,
                 British Columbia, Canada}",
  title =        "{Proceedings of the 27th International Symposium on
                 Computer Architecture, June 12--14, 2000, Vancouver,
                 British Columbia, Canada}",
  volume =       "28(2)",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "vi + 327",
  year =         "2000",
  CODEN =        "CANED2",
  ISBN =         "1-58113-232-8",
  ISBN-13 =      "978-1-58113-232-8",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9.A73 S97 2000",
  bibdate =      "Fri May 12 12:35:59 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib;
                 z3950.bibsys.no:2100/BIBSYS",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=339647",
  acknowledgement = ack-nhfb,
  remark =       "ISCA '27 proceedings.",
}

@Book{Hill:2000:RCA,
  editor =       "Mark D. (Mark Donald) Hill and Norman P. (Norman Paul)
                 Jouppi and Gurindar Sohi",
  booktitle =    "Readings in Computer Architecture",
  title =        "Readings in Computer Architecture",
  publisher =    pub-MORGAN-KAUFMANN,
  address =      pub-MORGAN-KAUFMANN:adrsf,
  pages =        "xviii + 717",
  year =         "2000",
  ISBN =         "1-55860-539-8",
  ISBN-13 =      "978-1-55860-539-8",
  LCCN =         "QA76.9.A73 H55 2000",
  bibdate =      "Fri May 12 15:34:46 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/bibnet/authors/w/wilkes-maurice-v.bib;
                 https://www.math.utah.edu/pub/tex/bib/sigarch.bib;
                 z3950.loc.gov:7090/Voyager",
  URL =          "http://books.elsevier.com/bookscat/links/details.asp?isbn=1558605398;
                 http://www.loc.gov/catdir/description/els033/99044480.html;
                 http://www.loc.gov/catdir/toc/els033/99044480.html;
                 https://archive.org/details/readingsincomput0000hill/page/n9/mode/2up?q=Slave+memories;
                 https://shop.elsevier.com/books/readings-in-computer-architecture/hill/978-0-08-057364-9",
  acknowledgement = ack-nhfb,
  shorttableofcontents = "1: Classic Machines: Technology,
                 Implementation, and Economics \\
                 2: Methods \\
                 3: Instruction Sets \\
                 4: Instruction Level Parallelism (ILP) \\
                 5: Dataflow and Multithreading \\
                 6: Memory Systems \\
                 7: I/O: Storage Systems, Networks, and Graphics \\
                 8: Single-Instruction Multiple Data (SIMD) Parallelism
                 \\
                 9: Multiprocessors and Multicomputers \\
                 10: Recent Implementations and Future Prospects",
  subject =      "Computer architecture",
  tableofcontents = "PREFACE \\
                 CHAPTER 1: Classic Machines: Technology,
                 Implementation, and Economics \\
                 G. M. Amdahl, G. A. Blaauw, F. P. Brooks, Jr.,
                 ``Architecture of the IBM System/360,'' IBM Journal of
                 Research and Development, April 1964 \\
                 J. E. Thornton, ``Parallel Operation in the Control
                 Data 6600,'' Fall Joint Computer Conference, vol. 26,
                 pp. 33--40, 1964 \\
                 R. M. Russell, ``The Cray-1 Computer System'', Comm.
                 ACM, 21, 1 (January 1978), 63--72 \\
                 J. Kolodzey, ``Cray-1 Computer Technology'', IEEE
                 Transactions on Components, Hybrids, and Manufacturing
                 Technology, p181--187, June 1981 \\
                 G. Moore, ``Cramming More Components onto Integrated
                 Circuits'', Electronics, p114--117, April 1965 \\
                 S. Mazor, ``The History of the Microcomputer Invention
                 and Evolution'', Proc. IEEE Dec '95, 1601--1607 \\
                 CHAPTER 2: Methods \\
                 G. M. Amdahl, ``Validity of the Single-Processor
                 Approach to Achieving Large Scale Computing
                 Capabilities'', AFIPS Conference Proceedings, (April
                 1967), 483--485 \\
                 M. D. Hill and A. J. Smith, ``Evaluating Associativity
                 in CPU Caches'', IEEE Trans. on Computers, C-38, 12
                 (December 1989), 1612--1630 \\
                 J. S. Emer and D. W. Clark, ``A Characterization of
                 Processor Performance in the VAX-11/780'', Proc.
                 Eleventh International Symposium on Computer
                 Architecture, Ann Arbor, MI (June 1984), 301--310 \\
                 CHAPTER 3: Instruction Sets \\
                 W. A. Wulf, ``Compilers and Computer Architecture'',
                 IEEE Computer, 14, 7 (July 1981), 41--48 \\
                 G. Radin, ``The 801 Minicomputer,'' Proc. Symposium on
                 Architectural Support for Programming Languages and
                 Operating Systems, March 1982, 39--47 \\
                 D. A. Patterson and D. R. Ditzel, ``The Case for the
                 Reduced Instruction Set Computer,'' ACM Computer
                 Architecture News, 8, 6, 15 October 1980, 25--33 \\
                 R. P. Colwell, C. Y. Hitchcock, E. D. Jensen, H. M.
                 Brinkley Sprunt, C. P. Kollar, ``Computers, Complexity,
                 and Controversy,'' IEEE Computer, vol. 18, no. 9,
                 September 1985 \\
                 J. Crawford, ``Architecture of the Intel 80386,''
                 Proceedings of ICCD, pp. 155--160, October 1986 \\
                 S. Mahlke, R. Hank, J. McCormick, D. August, W. Hwu,
                 ``A Comparison of Full and Partial Predicated Execution
                 Support for ILP Processors'', Proc. 22nd Annual
                 Symposium on Computer Architecture (June 1995),
                 138--150 \\
                 CHAPTER 4: Instruction Level Parallelism (ILP) \\
                 D. W. Anderson, F. J. Sparacio and R. M. Tomasulo,
                 ``The IBM System/360 Model 91: Machine Philosophy and
                 Instruction-Handling'', IBM Journal of Research and
                 Development, January 1967 \\
                 J. E. Smith and A. R. Pleszkun, ``Implementing Precise
                 Interrupts in Pipelined Processors'', IEEE Trans. on
                 Computers, C-37, 5 (May 1988), 562--573 \\
                 J. E. Smith, ``A Study of Branch Prediction
                 Strategies'', Proc. Eighth Annual Symposium on Computer
                 Architecture (May 1981), 135--148 \\
                 T.-Y. Yeh and Y. N. Patt, ``Two-Level Adaptive Branch
                 Prediction,'' Proc. 24th Annual Workshop on
                 Microprogramming (MICRO-24), Albuquerque, NM, (December
                 1991) \\
                 Y. N. Patt, W. W. Hwu and M. Shebanow, ``HPS, A New
                 Microarchitecture: Introduction and Rationale,'' Proc.
                 18th Annual Workshop on Microprogramming, Pacific
                 Grove, CA (December 1985), 103--108 \\
                 G. S. Sohi and S. Vajapeyam, ``Instruction Issue Logic
                 for High-Performance, Interruptible Pipelined
                 Processors'', Proc. 14th Annual Symposium on Computer
                 Architecture (June 1987), 27--34 \\
                 G. F. Grohoski, ``Machine Organization of the IBM RISC
                 System/6000 processor,'' IBM Journal of Research and
                 Development, 34, 1 (January 1990), 37--58 \\
                 K. C. Yeager, ``The MIPS R10000 Superscalar
                 Microprocessor'', IEEE Micro, 16, 2, April 1996, 28--40
                 \\
                 B. R. Rau and J. A. Fisher, ``Instruction-Level
                 Parallel Processing: History, Overview, and
                 Perspective'', The Journal of Supercomputing, 7, 1,
                 (??? 1993), 9--50. Reprinted in Rau and Fisher (ed.),
                 ``Instruction-Level Parallelism'', Kluwer Academic
                 Publishers, 1993 \\
                 CHAPTER 5: Dataflow and Multithreading \\
                 J. B. Dennis and D. P. Misunas, ``A Preliminary
                 Architecture for a Basic Data-Flow Processor,'' Proc.
                 2nd Annual Symposium on Computer Architecture, Computer
                 Architecture News, 3, 4 (December 1974), 126--132, ACM
                 \\
                 Arvind and R. S. Nikhil, ``Executing a Program on the
                 MIT Tagged-Token Dataflow Architecture'', IEEE Trans.
                 on Computers, 39, 3 (March 1990), 300--318 \\
                 B. Smith, ``Architecture and Applications of the HEP
                 Multiprocessor Computer System'', Proc. of the Int.
                 Soc. for Opt. Engr. (1981), 241--248 \\
                 D. M. Tullsen, S. J. Eggers, J. S. Emer, H. M. Levy, J.
                 L. Lo and R. L. Stamm, ``Exploiting Choice: Instruction
                 Fetch and Issue on an Implementable Simultaneous
                 Multithreading Processor'', Proc. 23rd Annual Symposium
                 on Computer Architecture (May 1996), 191--202 \\
                 CHAPTER 6: Memory Systems \\
                 M. V. Wilkes, ``Slave Memories and Dynamic Storage
                 Allocation'', IEEE Trans. on Electronic Computers,
                 EC-14, 2 (April 1965), 270--271 \\
                 J. S. Liptay, ``Structural Aspects of the System/360
                 Model 85, Part II: The Cache'', IBM Systems Journal,
                 7, 1 (1968), 15--21 \\
                 D. Kroft, ``Lockup-Free Instruction Fetch/Prefetch
                 Cache Organization'', Proc. Eighth Symposium on
                 Computer Architecture (May 1981), 81--87 \\
                 J. R. Goodman, ``Using Cache Memory to Reduce
                 Processor-Memory Traffic'', Proc. Tenth International
                 Symposium on Computer Architecture, Stockholm, Sweden
                 (June 1983), 124--131 \\
                 N. P. Jouppi, ``Improving Direct-Mapped Cache
                 Performance by the Addition of a Small
                 Fully-Associative Cache and Prefetch Buffers'', Proc.
                 17th Annual Symposium on Computer Architecture,
                 Computer Architecture News, 18, 2 (June 1990),
                 364--373, ACM \\
                 T. Kilburn, D. B. G. Edwards, M. J. Lanigan, F. H.
                 Sumner, ``One-Level Storage System'', IRE Transactions,
                 EC-11, 2, (April 1962), 223--235 \\
                 D. W. Clark and J. S. Emer, ``Performance of the
                 VAX-11/780 Translation Buffer: Simulation and
                 Measurement'', ACM Trans. on Computer Systems, 3, 1
                 (February 1985), 31--62 \\
                 W. Wang, J.-L. Baer and H. M. Levy, ``Organization and
                 Performance of a Two-Level Virtual-Real Cache
                 Hierarchy'', Proc. 16th Annual International Symposium
                 on Computer Architecture, Jerusalem (June 1989),
                 140--148 \\
                 CHAPTER 7: I/O: Storage Systems, Networks, and Graphics
                 \\
                 M. Smotherman, ``A Sequencing-based Taxonomy of I/O
                 Systems and Review of Historical Machines'', ACM
                 Computer Architecture News 17:5, (September 1989), pgs
                 5--15 \\
                 Storage Systems \\
                 C. Ruemmler and J. Wilkes, ``An Introduction to Disk
                 Drive Modeling'', IEEE Computer vol 27 \#3, March 1994,
                 pgs 17--28 \\
                 D. A. Patterson, G. Gibson and R. H. Katz, ``A Case for
                 Redundant Arrays of Inexpensive Disks (RAID)'', Proc.
                 ACM SIGMOD Conference, Chicago, Illinois (June 1988) \\
                 Networks \\
                 R. Metcalfe and D. Boggs, ``Ethernet: Distributed
                 Packet Switching for Local Computer Networks.''
                 Communications of the ACM, 19(7):395--404 \\
                 L. Ni and P. McKinley, ``A Survey of Wormhole Routing
                 Techniques in Direct Networks'', IEEE Computer,
                 February 1993, vol 26 \#2, pgs 62--76 \\
                 Graphics \\
                 K. Akeley, ``RealityEngine Graphics'', SIGGRAPH '93
                 Proceedings, pp 109--116 \\
                 CHAPTER 8: Single-Instruction Multiple Data (SIMD)
                 Parallelism \\
                 M. J. Flynn, ``Very High-Speed Computing Systems'',
                 Proceedings of the IEEE, vol. 54, no. 12, December
                 1966 \\
                 D. J. Kuck and R. A. Stokes, ``The Burroughs Scientific
                 Processor (BSP)'', IEEE Trans. on Computers, vol.
                 C-31, pp. 363--376, May 1982 \\
                 M. Gokhale, B. Holmes, K. Iobst, ``Processing in
                 Memory: The Terasys Massively Parallel PIM Array'',
                 IEEE Computer, 28, 4 (April 1995), 23--31 \\
                 CHAPTER 9: Multiprocessors and Multicomputers \\
                 W. A. Wulf and S. P. Harbison, ``Reflections in a pool
                 of processors / An experience report on C.mmp/Hydra'',
                 Proc. National Computer Conference (AFIPS) (June 1978)
                 \\
                 L. Lamport, ``How to Make a Multiprocessor Computer
                 That Correctly Executes Multiprocess Programs'', IEEE
                 Trans. on Computers, C-28, 9 (September 1979), 690--691
                 \\
                 L. M. Censier and P. Feautrier, ``A New Solution to
                 Coherence Problems in Multicache Systems'', IEEE
                 Transactions on Computers, C-27, 12 (December 1978),
                 1112--1118 \\
                 D. Lenoski, J. Laudon, K. Gharachorloo, W. Weber, A.
                 Gupta, J. Hennessy, M. Horowitz and M. Lam, ``The
                 Stanford DASH Multiprocessor'', IEEE Computer, 25, 3
                 (March 1992), 63--79 \\
                 E. Hagersten, A. Landin, and S. Haridi, ``DDM--A
                 Cache-Only Memory Architecture'', IEEE Computer, 25, 9
                 (September 1992), 44--54 \\
                 C. L. Seitz, ``The Cosmic Cube'', Comm. ACM (January
                 1985), 22--33 \\
                 K. Li and P. Hudak, ``Memory Coherence in Shared
                 Virtual Memory Systems'', ACM Trans. on Computer
                 Systems, 7, 4 (November 1989), 321--359 \\
                 CHAPTER 10: Recent Implementations and Future Prospects
                 \\
                 D. Alpert, D. Avnon, ``Architecture of the Pentium
                 Microprocessor'', IEEE Micro, June '93, 11--21 \\
                 D. Papworth, ``Tuning the Pentium Pro Micro
                 Architecture'', IEEE Micro April '96, 8--15 \\
                 M. Slater, ``The Microprocessor Today'', IEEE Micro Dec
                 '96, 32--44 \\
                 A. Yu, ``The Future of Microprocessors'', IEEE Micro
                 Dec '96, 46--53.",
}

@Proceedings{ACM:2001:PIS,
  editor =       "{ACM}",
  booktitle =    "{Proceedings of the 28th International Symposium on
                 Computer Architecture, June 30--July 4, 2001,
                 G{\"o}teborg, Sweden}",
  title =        "{Proceedings of the 28th International Symposium on
                 Computer Architecture, June 30--July 4, 2001,
                 G{\"o}teborg, Sweden}",
  volume =       "29(2)",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "xi + 289",
  year =         "2001",
  CODEN =        "CANED2",
  ISBN =         "0-7695-1162-7, 0-7695-1163-5, 0-7695-1164-3",
  ISBN-13 =      "978-0-7695-1162-7, 978-0-7695-1163-4,
                 978-0-7695-1164-1",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9.A73 C64 2001",
  bibdate =      "Fri May 12 12:36:32 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib;
                 z3950.bibsys.no:2100/BIBSYS",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=379240",
  acknowledgement = ack-nhfb,
  remark =       "ISCA '01 proceedings.",
}

@Proceedings{ACM:2002:PIS,
  editor =       "{ACM}",
  booktitle =    "{Proceedings of the 29th International Symposium on
                 Computer Architecture, May 25--29, 2002, Anchorage,
                 Alaska}",
  title =        "{Proceedings of the 29th International Symposium on
                 Computer Architecture, May 25--29, 2002, Anchorage,
                 Alaska}",
  volume =       "30(2)",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "xv + 331",
  year =         "2002",
  CODEN =        "CANED2",
  ISBN =         "0-7695-1605-X, 0-7695-1606-8, 0-7695-1607-6",
  ISBN-13 =      "978-0-7695-1605-9, 978-0-7695-1606-6,
                 978-0-7695-1607-3",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.9.A73 S97 2002",
  bibdate =      "Fri May 12 12:36:48 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib;
                 z3950.bibsys.no:2100/BIBSYS",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=545215",
  acknowledgement = ack-nhfb,
  remark =       "ISCA '02 proceedings.",
}

@Proceedings{IEEE:2003:PAI,
  editor =       "{IEEE}",
  booktitle =    "{Proceedings: 30th Annual International Symposium on
                 Computer Architecture: San Diego, California, USA, June
                 9--11, 2003: ISCA '03}",
  title =        "{Proceedings: 30th Annual International Symposium on
                 Computer Architecture: San Diego, California, USA, June
                 9--11, 2003: ISCA '03}",
  volume =       "31(2)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xi + 448",
  year =         "2003",
  CODEN =        "CANED2",
  ISBN =         "0-7695-1945-8",
  ISBN-13 =      "978-0-7695-1945-6",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76 .S93 2002",
  bibdate =      "Fri May 12 12:35:09 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=859618",
  acknowledgement = ack-nhfb,
  remark =       "ISCA '03 proceedings",
}

@Proceedings{ACM:2004:PAI,
  editor =       "{ACM}",
  booktitle =    "{Proceedings: 31st Annual International Symposium on
                 Computer Architecture: ISCA 2004: [June 19--23, 2004,
                 M{\"u}nchen, Germany]}",
  title =        "{Proceedings: 31st Annual International Symposium on
                 Computer Architecture: ISCA 2004: [June 19--23, 2004,
                 M{\"u}nchen, Germany]}",
  volume =       "32(2)",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "xiv + 388",
  year =         "2004",
  CODEN =        "CANED2",
  ISBN =         "0-7695-2143-6",
  ISBN-13 =      "978-0-7695-2143-5",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "QA76.5 .S84 2004",
  bibdate =      "Fri May 12 12:32:28 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "Includes CD-ROM.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=998680",
  acknowledgement = ack-nhfb,
  remark =       "ISCA '04 Proceedings",
}

@Proceedings{IEEE:2005:ISC,
  editor =       "{IEEE}",
  booktitle =    "{32nd International Symposium on Computer
                 Architecture: proceedings, Madison, Wisconsin, June
                 4--8, 2005}",
  title =        "{32nd International Symposium on Computer
                 Architecture: proceedings, Madison, Wisconsin, June
                 4--8, 2005}",
  volume =       "33(2)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xviii + 557",
  year =         "2005",
  CODEN =        "CANED2",
  ISBN =         "0-7695-2270-X",
  ISBN-13 =      "978-0-7695-2270-8",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "????",
  bibdate =      "Fri May 12 13:31:22 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  note =         "Includes CD-ROM.",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://portal.acm.org/toc.cfm?id=1069807",
  acknowledgement = ack-nhfb,
  remark =       "ISCA '05 Proceedings",
}

@Proceedings{IEEE:2006:ISC,
  editor =       "{IEEE}",
  booktitle =    "{33rd International Symposium on Computer
                 Architecture: proceedings, Boston, MA, USA, June
                 17--21, 2006}",
  title =        "{33rd International Symposium on Computer
                 Architecture: proceedings, Boston, MA, USA, June
                 17--21, 2006}",
  volume =       "34(2)",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "????",
  year =         "2006",
  CODEN =        "CANED2",
  ISBN =         "0-7695-2608-X",
  ISBN-13 =      "978-0-7695-2608-9",
  ISSN =         "0163-5964 (ACM), 0884-7495 (IEEE)",
  ISSN-L =       "0163-5964",
  LCCN =         "????",
  bibdate =      "Fri May 12 13:31:22 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  series =       j-COMP-ARCH-NEWS,
  URL =          "http://www.ece.neu.edu/conf/isca2006/",
  acknowledgement = ack-nhfb,
  remark =       "ISCA '06 Proceedings",
}