%%% -*-BibTeX-*-
%%% ====================================================================
%%%  BibTeX-file{
%%%     author          = "Nelson H. F. Beebe",
%%%     version         = "1.65",
%%%     date            = "27 August 2024",
%%%     time            = "10:56:16 MDT",
%%%     filename        = "jetc.bib",
%%%     address         = "University of Utah
%%%                        Department of Mathematics, 110 LCB
%%%                        155 S 1400 E RM 233
%%%                        Salt Lake City, UT 84112-0090
%%%                        USA",
%%%     telephone       = "+1 801 581 5254",
%%%     FAX             = "+1 801 581 4148",
%%%     URL             = "https://www.math.utah.edu/~beebe",
%%%     checksum        = "41997 27255 142980 1364781",
%%%     email           = "beebe at math.utah.edu, beebe at acm.org,
%%%                        beebe at computer.org (Internet)",
%%%     codetable       = "ISO/ASCII",
%%%     keywords        = "ACM Journal on Emerging Technologies in
%%%                        Computing Systems (JETC); bibliography;
%%%                        BibTeX",
%%%     license         = "public domain",
%%%     supported       = "no",
%%%     docstring       = "This is a COMPLETE BibTeX bibliography for
%%%                        the journal ACM Journal on Emerging
%%%                        Technologies in Computing Systems (JETC)
%%%                        (CODEN unknown, ISSN: 1550-4832 (print),
%%%                        1550-4840 (electronic)), for 2005--date.
%%%
%%%                        Publication began with volume 1, number 1,
%%%                        in March 2005.  The journal appears
%%%                        quarterly.
%%%
%%%                        The journal has a World-Wide Web site at:
%%%
%%%                            http://www.acm.org/pubs/jetc
%%%
%%%                        Tables-of-contents of all issues are
%%%                        available at:
%%%
%%%                            http://www.acm.org/pubs/contents/journals/jetc/
%%%                            http://portal.acm.org/browse_dl.cfm?idx=J967
%%%
%%%                        Qualified subscribers can retrieve the full
%%%                        text of recent articles in PDF form.
%%%
%%%                        At version 1.65, the COMPLETE journal
%%%                        coverage looked like this:
%%%
%%%                             2005 (   7)    2012 (  34)    2019 (  39)
%%%                             2006 (  11)    2013 (  30)    2020 (  44)
%%%                             2007 (  15)    2014 (  57)    2021 (  63)
%%%                             2008 (  24)    2015 (  42)    2022 (  81)
%%%                             2009 (  19)    2016 (  27)    2023 (  34)
%%%                             2010 (  15)    2017 (  50)    2024 (  11)
%%%                             2011 (  20)    2018 (  48)
%%%
%%%                             Article:        671
%%%
%%%                             Total entries:  671
%%%
%%%                        Data for this bibliography was derived from
%%%                        data at the ACM Web site.
%%%
%%%                        ACM copyrights explicitly permit abstracting
%%%                        with credit, so article abstracts, keywords,
%%%                        and subject classifications have been
%%%                        included in this bibliography wherever
%%%                        available.
%%%
%%%                        The bibsource keys in the bibliography
%%%                        entries below indicate the data sources.
%%%
%%%                        URL keys in the bibliography point to
%%%                        World Wide Web locations of additional
%%%                        information about the entry.
%%%
%%%                        Spelling has been verified with the UNIX
%%%                        spell and GNU ispell programs using the
%%%                        exception dictionary stored in the
%%%                        companion file with extension .sok.
%%%
%%%                        BibTeX citation tags are uniformly chosen
%%%                        as name:year:abbrev, where name is the
%%%                        family name of the first author or editor,
%%%                        year is a 4-digit number, and abbrev is a
%%%                        3-letter condensation of important title
%%%                        words. Citation tags were automatically
%%%                        generated by software developed for the
%%%                        BibNet Project.
%%%
%%%                        In this bibliography, entries are sorted in
%%%                        publication order, using ``bibsort -byvolume.''
%%%
%%%                        The checksum field above contains a CRC-16
%%%                        checksum as the first value, followed by the
%%%                        equivalent of the standard UNIX wc (word
%%%                        count) utility output of lines, words, and
%%%                        characters.  This is produced by Robert
%%%                        Solovay's checksum utility.",
%%%  }
%%% ====================================================================
%%% TeX code emitted ahead of the typeset bibliography: it reads
%%% bibnames.sty and then defines \circled, \pkg, and \reg, each guarded
%%% by an \ifx \undefined test so that a macro the document has already
%%% defined is left untouched.  \reg expands to \circled{R}.
@Preamble{"\input bibnames.sty"
    # "\ifx \undefined \circled   \def \circled   #1{(#1)}        \fi"
    # "\ifx \undefined \pkg       \def \pkg       #1{{{\tt #1}}}  \fi"
    # "\ifx \undefined \reg       \def \reg         {\circled{R}} \fi"
}

%%% ====================================================================
%%% Acknowledgement abbreviations:
%%% ack-nhfb: postal address, telephone/FAX numbers, e-mail addresses,
%%% and URL of the bibliography maintainer.  Entries refer to it through
%%% their acknowledgement field.
@String{ack-nhfb = "Nelson H. F. Beebe,
                    University of Utah,
                    Department of Mathematics, 110 LCB,
                    155 S 1400 E RM 233,
                    Salt Lake City, UT 84112-0090, USA,
                    Tel: +1 801 581 5254,
                    FAX: +1 801 581 4148,
                    e-mail: \path|beebe@math.utah.edu|,
                            \path|beebe@acm.org|,
                            \path|beebe@computer.org| (Internet),
                    URL: \path|https://www.math.utah.edu/~beebe/|"}

%%% ====================================================================
%%% Journal abbreviations:
%%% j-JETC: full journal title used by the journal field of every entry.
@String{j-JETC = {ACM Journal on Emerging Technologies in Computing
                  Systems (JETC)}}

%%% ====================================================================
%%% Bibliography entries:
%%% Opening editorial of volume 1, number 1; no DOI or abstract recorded.
@Article{Irwin:2005:E,
  author          = {Mary Jane Irwin and Vijaykrishnan Narayanan},
  title           = {Editorial},
  journal         = j-JETC,
  volume          = {1},
  number          = {1},
  pages           = {1--6},
  month           = apr,
  year            = {2005},
  CODEN           = {????},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Sat Sep 17 15:29:54 MDT 2005},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% Volume 1, number 1; CMOS is braced to keep its capitals under
%%% sentence-casing styles.  No DOI or abstract recorded.
@Article{Narendra:2005:CDC,
  author          = {Siva G. Narendra},
  title           = {Challenges and design choices in nanoscale {CMOS}},
  journal         = j-JETC,
  volume          = {1},
  number          = {1},
  pages           = {7--49},
  month           = apr,
  year            = {2005},
  CODEN           = {????},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Sat Sep 17 15:29:54 MDT 2005},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% Volume 1, number 1; QCA is braced to keep its capitals.  No DOI or
%%% abstract recorded.
@Article{Lim:2005:PPB,
  author          = {Sung Kyu Lim and Ramprasad Ravichandran and Mike
                     Niemier},
  title           = {Partitioning and placement for buildable {QCA}
                     circuits},
  journal         = j-JETC,
  volume          = {1},
  number          = {1},
  pages           = {50--72},
  month           = apr,
  year            = {2005},
  CODEN           = {????},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Sat Sep 17 15:29:54 MDT 2005},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% Volume 1, number 2 is the July 2005 issue: the journal is quarterly,
%%% and numbers 1 and 3 of this volume are dated April and October.
@Article{Gojman:2005:EDS,
  author =       "Benjamin Gojman and Eric Rachlin and John E. Savage",
  title =        "Evaluation of design strategies for stochastically
                 assembled nanoarray memories",
  journal =      j-JETC,
  volume =       "1",
  number =       "2",
  pages =        "73--108",
  month =        jul,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Sep 17 15:29:54 MDT 2005",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% The author's family name is spelled ``DeHon'' (as in the author list
%%% of Savage:2006:RAN).  The citation tag stays Dehon:2005:NBP so that
%%% existing citations still resolve.  Volume 1, number 2 is the July
%%% 2005 issue.
@Article{Dehon:2005:NBP,
  author =       "Andr{\'e} DeHon",
  title =        "Nanowire-based programmable architectures",
  journal =      j-JETC,
  volume =       "1",
  number =       "2",
  pages =        "109--162",
  month =        jul,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Sep 17 15:29:54 MDT 2005",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% Volume 1, number 3; author given names are recorded as initials
%%% only.  No DOI or abstract recorded.
@Article{Huang:2005:TBQ,
  author          = {J. Huang and M. Momenzadeh and L. Schiano and M.
                     Ottavi and F. Lombardi},
  title           = {Tile-based {QCA} design using majority-like logic
                     primitives},
  journal         = j-JETC,
  volume          = {1},
  number          = {3},
  pages           = {163--185},
  month           = oct,
  year            = {2005},
  CODEN           = {????},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Tue Mar 7 16:16:02 MST 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% Volume 1, number 3; no DOI or abstract recorded.
@Article{Chakrabarty:2005:DAM,
  author          = {Krishnendu Chakrabarty and Jun Zeng},
  title           = {Design automation for microfluidics-based biochips},
  journal         = j-JETC,
  volume          = {1},
  number          = {3},
  pages           = {186--223},
  month           = oct,
  year            = {2005},
  CODEN           = {????},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Tue Mar 7 16:16:02 MST 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% Volume 2, number 1; the acronym NANA is braced to keep its capitals.
%%% No DOI or abstract recorded.
@Article{Patwardhan:2006:NNS,
  author          = {Jaidev P. Patwardhan and Chris Dwyer and Alvin R.
                     Lebeck and Daniel J. Sorin},
  title           = {{NANA}: a nano-scale active network architecture},
  journal         = j-JETC,
  volume          = {2},
  number          = {1},
  pages           = {1--30},
  month           = jan,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Mon Aug 28 07:08:02 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% Volume 2, number 1; the two-word family name ``Van Meter'' is braced
%%% so that BibTeX treats it as a single surname.
@Article{VanMeter:2006:AIQ,
  author          = {Rodney {Van Meter} and Mark Oskin},
  title           = {Architectural implications of quantum computing
                     technologies},
  journal         = j-JETC,
  volume          = {2},
  number          = {1},
  pages           = {31--63},
  month           = jan,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Mon Aug 28 07:08:02 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% Volume 2, number 2; ``3D'' is written as braced {$3$D} so the digit
%%% is set in math mode and the D keeps its case.
@Article{Xie:2006:DSE,
  author          = {Yuan Xie and Gabriel H. Loh and Bryan Black and Kerry
                     Bernstein},
  title           = {Design space exploration for {$3$D} architectures},
  journal         = j-JETC,
  volume          = {2},
  number          = {2},
  pages           = {65--103},
  month           = apr,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Mon Aug 28 07:08:02 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% Volume 2, number 2; no DOI or abstract recorded.
@Article{Su:2006:YER,
  author          = {Fei Su and Krishnendu Chakrabarty},
  title           = {Yield enhancement of reconfigurable
                     microfluidics-based biochips using interstitial
                     redundancy},
  journal         = j-JETC,
  volume          = {2},
  number          = {2},
  pages           = {104--128},
  month           = apr,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Mon Aug 28 07:08:02 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% Volume 2, number 2; the accented letter is written as the BibTeX
%%% special character {\'e}.  No DOI or abstract recorded.
@Article{Savage:2006:RAN,
  author          = {John E. Savage and Eric Rachlin and Andr{\'e} DeHon
                     and Charles M. Lieber and Yue Wu},
  title           = {Radial addressing of nanowires},
  journal         = j-JETC,
  volume          = {2},
  number          = {2},
  pages           = {129--154},
  month           = apr,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Mon Aug 28 07:08:02 MDT 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% Volume 2, number 3; no DOI or abstract recorded.
@Article{Massoud:2006:MDC,
  author          = {Yehia Massoud and Arthur Nieuwoudt},
  title           = {Modeling and design challenges and solutions for
                     carbon nanotube-based interconnect in future high
                     performance integrated circuits},
  journal         = j-JETC,
  volume          = {2},
  number          = {3},
  pages           = {155--196},
  month           = jul,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Thu Nov 16 18:25:43 MST 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% Volume 2, number 3; no DOI or abstract recorded.
@Article{Tahoori:2006:AID,
  author          = {Mehdi B. Tahoori},
  title           = {Application-independent defect tolerance of
                     reconfigurable nanoarchitectures},
  journal         = j-JETC,
  volume          = {2},
  number          = {3},
  pages           = {197--218},
  month           = jul,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Thu Nov 16 18:25:43 MST 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% Volume 2, number 3; no DOI or abstract recorded.
@Article{Datta:2006:ADF,
  author          = {Kushal Datta and Arindam Mukherjee and Arun
                     Ravindran},
  title           = {Automated design flow for diode-based nanofabrics},
  journal         = j-JETC,
  volume          = {2},
  number          = {3},
  pages           = {219--241},
  month           = jul,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Thu Nov 16 18:25:43 MST 2006},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% Volume 2, number 4; first entry in the file that carries a DOI, an
%%% abstract, and keywords.
@Article{Ottavi:2006:HHE,
  author          = {Marco Ottavi and Luca Schiano and Fabrizio Lombardi
                     and Douglas Tougaw},
  title           = {{HDLQ}: {A HDL} environment for {QCA} design},
  journal         = j-JETC,
  volume          = {2},
  number          = {4},
  pages           = {243--261},
  month           = oct,
  year            = {2006},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1216396.1216397},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Fri Jun 20 11:03:17 MDT 2008},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Emerging technologies have attracted a substantial
                     interest in overcoming the physical limitations of CMOS
                     as projected at the end of the Technology Roadmap;
                     among these technologies, quantum-dot cellular automata
                     (QCA) relies on different and novel paradigms to
                     implement dense, low power circuits and systems for
                     high-performance computing. As applicable to existing
                     technologies, a hierarchical process can be utilized to
                     facilitate the design of QCA circuits. Tools and
                     methodologies both at system and physical levels are
                     required to support all design phases. This article
                     presents an HDL model to describe QCA ``devices'' (also
                     referred elsewhere in the technical literature as
                     building blocks, i.e., majority voter, inverter, wire,
                     crossover) and facilitate the evaluation of their
                     design. This tool, referred to as HDLQ, allows a
                     designer to verify the logic characteristics of a QCA
                     system, while supporting within a design environment
                     different operational mechanisms (such as fault
                     injection) and the unique features of QCA (such as
                     bidirectionality and timing/clocking partitioning). The
                     applicability of this design environment to various
                     memory circuits for logic and timing verification is
                     presented in detail. Various defective conditions for
                     kinks due to thermodynamic effects and permanent faults
                     due to manufacturing defects are considered for
                     injection.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  keywords        = {CAD; fault injection; HDL; QCA},
}

%%% Volume 2, number 4; abstract and keywords are recorded as given in
%%% the entry's source data.
@Article{Davids:2006:MFD,
  author          = {Daniel Davids and Siddhartha Datta and Arindam
                     Mukherjee and Bharat Joshi and Arun Ravindran},
  title           = {Multiple fault diagnosis in digital microfluidic
                     biochips},
  journal         = j-JETC,
  volume          = {2},
  number          = {4},
  pages           = {262--276},
  month           = oct,
  year            = {2006},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1216396.1216398},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Fri Jun 20 11:03:17 MDT 2008},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Microfluidics-based biochips consist of microfluidic
                     arrays on rigid substrates through which, movement of
                     fluids is tightly controlled to facilitate biological
                     reactions. Biochips are soon expected to revolutionize
                     biosensing, clinical diagnostics, and drug discovery.
                     Critical to the deployment of biochips in such diverse
                     areas is the dependability of these systems. Thus,
                     robust testing techniques are required to ensure an
                     adequate level of system dependability. Due to the
                     underlying mixed technology and energy domains, such
                     biochips exhibit unique failure mechanisms and defects.
                     In this article we present a highly effective fault
                     diagnosis strategy that uses a single source and sink
                     to detect and locate multiple faults in a microfluidic
                     array, without flooding the array, a problem that has
                     hampered realistic implementations of all existing
                     strategies. The strategy renders itself well for a
                     built-in self-test that could drastically reduce the
                     operating cost of microfluidic biochips. It can be used
                     during both the manufacturing phase of the biochip, as
                     well as field operation. Furthermore, the algorithm can
                     pinpoint the actual fault, as opposed to merely the
                     faulty regions that are typically identified by
                     strategies proposed in the literature. Also, analytical
                     results suggest that it is an effective strategy that
                     can be used to design highly dependable biochip
                     systems.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  keywords        = {droplet flooding; faults tolerance; Microfluidic
                     biochip; multiple fault; testing},
}

%%% Volume 2, number 4; last article of the 2006 volume in this file.
@Article{Prasad:2006:DSA,
  author          = {Aditya K. Prasad and Vivek V. Shende and Igor L.
                     Markov and John P. Hayes and Ketan N. Patel},
  title           = {Data structures and algorithms for simplifying
                     reversible circuits},
  journal         = j-JETC,
  volume          = {2},
  number          = {4},
  pages           = {277--293},
  month           = oct,
  year            = {2006},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1216396.1216399},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Fri Jun 20 11:03:17 MDT 2008},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Reversible logic is motivated by low-power design,
                     quantum circuits, and nanotechnology. We develop a
                     compact representation of small reversible circuits to
                     generate and store optimal circuits for all 40,320
                     three-input reversible functions, and millions of
                     four-input circuits. This allows implementing a
                     function optimally in constant time for use in the
                     peephole optimization of larger circuits produced by
                     existing techniques, and guarantees that every
                     three-bit subcircuit is optimal. To generate
                     subcircuits, we use a graph-based data structure and
                     algorithms for circuit restructuring. Finally, we
                     demonstrate a suboptimal circuit for which peephole
                     optimization fails.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  keywords        = {circuit libraries; Circuit simplification; optimal
                     subcircuit},
}

%%% From volume 3 on, pages are recorded as articleno:page with an
%%% unknown final page (``1:1--1:??'').  In the abstract the percent
%%% sign is escaped as \% and the project URL is wrapped in \path|...|,
%%% so that neither the % nor the ~ is interpreted by TeX when the
%%% abstract is typeset.
@Article{Zhao:2007:PTM,
  author =       "Wei Zhao and Yu Cao",
  title =        "Predictive technology model for nano-{CMOS} design
                 exploration",
  journal =      j-JETC,
  volume =       "3",
  number =       "1",
  pages =        "1:1--1:??",
  month =        apr,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1229175.1229176",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Fri Jun 20 11:03:25 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "A predictive MOSFET model is critical for early
                 circuit design research. In this work, a new generation
                 of Predictive Technology Model (PTM) is developed,
                 covering emerging physical effects and alternative
                 structures, such as the double-gate device (i.e.,
                 FinFET). Based on physical models and early stage
                 silicon data, PTM of bulk and double-gate devices are
                 successfully generated from 130nm to 32nm technology
                 nodes, with effective channel length down to 13nm. By
                 tuning only ten primary parameters, PTM can be easily
                 customized to cover a wide range of process
                 uncertainties. The accuracy of PTM predictions is
                 comprehensively verified with published silicon data:
                 the error of the current is below 10\% for both NMOS
                 and PMOS. Furthermore, the new PTM correctly captures
                 process sensitivities in the nanometer regime. PTM is
                 available online at \path|http://www.eas.asu.edu/~ptm|.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "early design exploration; FinFET; predictive modeling;
                 process variations; Technology scaling",
}

%%% Volume 3, number 1, article 2; pages use the articleno:page form
%%% with an unknown final page.
@Article{Schulhof:2007:SRC,
  author          = {Gabriel Schulhof and Konrad Walus and Graham A.
                     Jullien},
  title           = {Simulation of random cell displacements in {QCA}},
  journal         = j-JETC,
  volume          = {3},
  number          = {1},
  pages           = {2:1--2:??},
  month           = apr,
  year            = {2007},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/1229175.1229177},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Fri Jun 20 11:03:25 MDT 2008},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {We analyze the behavior of quantum-dot cellular
                     automata (QCA) building blocks in the presence of
                     random cell displacements. The QCA cells are modeled
                     using the coherence vector description and simulated
                     using QCADesigner. We evaluate various fundamental
                     circuits: the wire, the inverter, the majority gate,
                     and the two-wire crossing approaches: the coplanar
                     crossover and the multilayer crossover. Our results
                     show that different building blocks have different
                     displacement tolerances. The coplanar crossover and
                     inverter perform the weakest. The wire is the most
                     robust. We have found displacement tolerances to be a
                     function of circuit layout and geometry rather than
                     cell size.},
  acknowledgement = ack-nhfb,
  articleno       = {2},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  keywords        = {fabrication variances; fault tolerance; QCA;
                     Quantum-dot cellular automata},
}

%%% Volume 3, number 1, article 3.  The title uses \slash so that
%%% ``CMOS/molecular'' may break at the slash.  In the abstract, ``such
%%% a memory'' refers back to the crossbar array-based molecular memory
%%% introduced in the preceding sentence.
@Article{Rose:2007:DCM,
  author =       "Garrett S. Rose and Yuxing Yao and James M. Tour and
                 Adam C. Cabe and Nadine Gergel-Hackett and Nabanita
                 Majumdar and John C. Bean and Lloyd R. Harriott and
                 Mircea R. Stan",
  title =        "Designing {CMOS}\slash molecular memories while
                 considering device parameter variations",
  journal =      j-JETC,
  volume =       "3",
  number =       "1",
  pages =        "3:1--3:??",
  month =        apr,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1229175.1229178",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Fri Jun 20 11:03:25 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "In recent years, many advances have been made in the
                 development of molecular scale devices. Experimental
                 data shows that these devices have potential for use in
                 both memory and logic. This article describes the
                 challenges faced in building crossbar array-based
                 molecular memory and develops a methodology to optimize
                 molecular scale architectures based on experimental
                 device data taken at room temperature. In particular,
                 issues in reading and writing such a memory using CMOS
                 are discussed, and a solution is introduced for easily
                 reading device conductivity states (typically
                 characterized by very small currents). Additionally, a
                 metric is derived to determine the voltages for writing
                 to the crossbar array. The proposed memory design is
                 also simulated with consideration to device parameter
                 variations. Thus, the results presented here shed light
                 on important design choices to be made at multiple
                 abstraction levels, from devices to architectures.
                 Simulation results, incorporating experimental device
                 data, are presented using Cadence Spectre.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "CMOS; molecular electronics; nanotechnology",
}

%%% JETC 3(2), July 2007, article 4: editorial for the special issue.
@article{McKee:2007:ESI,
    author          = {Sally A. McKee},
    title           = {Editorial to special issue on reliable computing},
    journal         = j-JETC,
    fjournal        = {ACM Journal on Emerging Technologies in Computing
                       Systems (JETC)},
    year            = {2007},
    month           = jul,
    volume          = {3},
    number          = {2},
    articleno       = {4},
    pages           = {4:1--4:??},
    doi             = {https://doi.org/10.1145/1265949.1265950},
    issn            = {1550-4832 (print), 1550-4840 (electronic)},
    issn-l          = {1550-4832},
    coden           = {????},
    journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
    acknowledgement = ack-nhfb,
    bibdate         = {Fri Jun 20 11:03:32 MDT 2008},
    bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
}

%%% JETC 3(2), July 2007, article 5.
@article{Eshaghian-Wilner:2007:SWN,
    author          = {Mary M. Eshaghian-Wilner and Alex Khitun and Shiva
                       Navab and Kang L. Wang},
    title           = {The spin-wave nanoscale reconfigurable mesh and the
                       labeling problem},
    journal         = j-JETC,
    fjournal        = {ACM Journal on Emerging Technologies in Computing
                       Systems (JETC)},
    year            = {2007},
    month           = jul,
    volume          = {3},
    number          = {2},
    articleno       = {5},
    pages           = {5:1--5:??},
    doi             = {https://doi.org/10.1145/1265949.1265951},
    issn            = {1550-4832 (print), 1550-4840 (electronic)},
    issn-l          = {1550-4832},
    coden           = {????},
    journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
    keywords        = {image processing; nanoscale architectures;
                       reconfigurable mesh; Spin waves},
    abstract        = {In this article, we present a nanoscale reconfigurable
                       mesh which is interconnected by ferromagnetic spin-wave
                       buses. In this architecture, unlike the traditional
                       spin-based nano structures which transmit charge, waves
                       are transmitted. As a result, the power consumption of
                       the proposed modules can be low. This reconfigurable
                       mesh, while requiring the same number of switches and
                       buses as the standard reconfigurable mesh, is capable
                       of simultaneously transmitting $N$ waves on each of the
                       spin-wave buses. Because of this highly parallel
                       feature, very fast and fault-tolerant algorithms can be
                       designed. To illustrate the superior performance of the
                       proposed spin-wave reconfigurable mesh, we present
                       three fast labeling algorithms.},
    acknowledgement = ack-nhfb,
    bibdate         = {Fri Jun 20 11:03:32 MDT 2008},
    bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
}

%%% JETC 3(2), July 2007, article 6.
@article{Prodan:2007:DDE,
    author          = {Lucian Prodan and Mihai Udrescu and Oana Boncalo and
                       Mircea Vladutiu},
    title           = {Design for dependability in emerging technologies},
    journal         = j-JETC,
    fjournal        = {ACM Journal on Emerging Technologies in Computing
                       Systems (JETC)},
    year            = {2007},
    month           = jul,
    volume          = {3},
    number          = {2},
    articleno       = {6},
    pages           = {6:1--6:??},
    doi             = {https://doi.org/10.1145/1265949.1265952},
    issn            = {1550-4832 (print), 1550-4840 (electronic)},
    issn-l          = {1550-4832},
    coden           = {????},
    journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
    keywords        = {bio-inspired computing; bio-inspired digital design;
                       Dependability; Embryonics; emerging technologies;
                       evolvable hardware; fault-tolerance assessment; quantum
                       computing; reliability},
    abstract        = {As current microelectronics will reach their physical
                       limits within the foreseeable future, emerging
                       technologies may offer a solution for maintaining the
                       trends to increase computing performance.
                       Biologically-inspired and quantum computing represent
                       two emerging technology vectors for novel computing
                       architectures within nanoelectronics. However,
                       potential benefits will come at the cost of increased
                       device sensitivity to the surrounding environment. This
                       article provides a dependability perspective over these
                       technologies from a designer's standpoint. Maintaining
                       or increasing the dependability of unconventional
                       computational processes is discussed in two different
                       contexts, a bio-inspired computing architecture (the
                       Embryonics project) and a quantum computational
                       architecture (the QUERIST project).},
    acknowledgement = ack-nhfb,
    bibdate         = {Fri Jun 20 11:03:32 MDT 2008},
    bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
}

%%% JETC 3(2), July 2007, article 7.
@article{Tyrrell:2007:ED,
    author          = {Andy M. Tyrrell and Andrew J. Greensted},
    title           = {Evolving dependability},
    journal         = j-JETC,
    fjournal        = {ACM Journal on Emerging Technologies in Computing
                       Systems (JETC)},
    year            = {2007},
    month           = jul,
    volume          = {3},
    number          = {2},
    articleno       = {7},
    pages           = {7:1--7:??},
    doi             = {https://doi.org/10.1145/1265949.1265953},
    issn            = {1550-4832 (print), 1550-4840 (electronic)},
    issn-l          = {1550-4832},
    coden           = {????},
    journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
    keywords        = {bio-inspired architectures; Evolutionary algorithms;
                       fault tolerance; RISA architecture},
    abstract        = {Evolvable hardware offers much for the future of
                       complex systems design. Evolutionary techniques not
                       only have the potential for larger solution space
                       coverage, but when implemented on hardware, also allow
                       system designs to adapt to changes in the environment,
                       including failures in system components. This article
                       reviews a number of novel techniques, all based in the
                       field of bio-inspired systems, that provide varying
                       degrees of dependability over and above standard
                       designs. In particular, three different techniques are
                       considered: using FPGAs and ideas from developmental
                       biology to create designs that possess emergent
                       fault-tolerant properties, using FPGAs and continuous
                       evolution to circumvent faults as and when they occur,
                       and, finally, we consider a novel ASIC designed and
                       built with bio-inspired systems in mind.},
    acknowledgement = ack-nhfb,
    bibdate         = {Fri Jun 20 11:03:32 MDT 2008},
    bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
}

%%% JETC 3(2), July 2007, article 8.
@article{Sekanina:2007:EFR,
    author          = {Luk{\'a}{\v{s}} Sekanina},
    title           = {Evolutionary functional recovery in virtual
                       reconfigurable circuits},
    journal         = j-JETC,
    fjournal        = {ACM Journal on Emerging Technologies in Computing
                       Systems (JETC)},
    year            = {2007},
    month           = jul,
    volume          = {3},
    number          = {2},
    articleno       = {8},
    pages           = {8:1--8:??},
    doi             = {https://doi.org/10.1145/1265949.1265954},
    issn            = {1550-4832 (print), 1550-4840 (electronic)},
    issn-l          = {1550-4832},
    coden           = {????},
    journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
    keywords        = {Dependability; evolutionary algorithms; evolvable
                       hardware; FPGA},
    abstract        = {A virtual reconfigurable circuit (VRC) is a
                       domain-specific reconfigurable device developed using
                       an ordinary FPGA in order to easily implement evolvable
                       hardware applications. While a fast partial runtime
                       reconfiguration and application-specific programmable
                       elements represent the main advantages of VRC, the main
                       disadvantage of the VRC is the area consumed. This
                       study describes experiments conducted to estimate how
                       the use of VRC influences the dependability of
                       FPGA-based evolvable systems. It is shown that these
                       systems are not as sensitive to faults as their
                       area-demanding implementations might suggest. An
                       evolutionary algorithm is utilized to design fault
                       tolerant circuits as well as to perform an automatic
                       functional recovery when faults are detected in the
                       configuration memory of the FPGA. All the experiments
                       are performed on models of reconfigurable devices.},
    acknowledgement = ack-nhfb,
    bibdate         = {Fri Jun 20 11:03:32 MDT 2008},
    bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
}

%%% JETC 3(2), July 2007, article 9: overview of the Embryonics project.
@Article{Tempesti:2007:SRH,
  author =       "Gianluca Tempesti and Daniel Mange and Pierre-Andre
                 Mudry and Jo{\"e}l Rossier and Andre Stauffer",
  title =        "Self-replicating hardware for reliability: {The
                 Embryonics Project}",
  journal =      j-JETC,
  volume =       "3",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jul,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1265949.1265955",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Fri Jun 20 11:03:32 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "The multicellular structure of biological organisms
                 and the interpretation in each of their cells of a
                 chemical program (the DNA string or genome) is the
                 source of inspiration for the Embryonics (embryonic
                 electronics) project, whose final objective is the
                 design of highly robust integrated circuits, endowed
                 with properties usually associated with the living
                 world: self-repair and self-replication. In this
                 article, we provide an overview of our latest research
                 in the domain of the self-replication of processing
                 elements within a programmable logic substrate, a key
                 prerequisite for achieving system-level fault tolerance
                 in our bio-inspired approach.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "Bio-inspired architectures; embryonic electronics;
                 growth; hierarchical fault tolerance; self-repair;
                 self-replication",
}

%%% JETC 3(2), July 2007, article 10.  The abstract has two paragraphs,
%%% separated by \par followed by a blank line.
@article{Patwardhan:2007:SOD,
    author          = {Jaidev Patwardhan and Chris Dwyer and Alvin R.
                       Lebeck},
    title           = {A self-organizing defect tolerant {SIMD}
                       architecture},
    journal         = j-JETC,
    fjournal        = {ACM Journal on Emerging Technologies in Computing
                       Systems (JETC)},
    year            = {2007},
    month           = jul,
    volume          = {3},
    number          = {2},
    articleno       = {10},
    pages           = {10:1--10:??},
    doi             = {https://doi.org/10.1145/1265949.1265956},
    issn            = {1550-4832 (print), 1550-4840 (electronic)},
    issn-l          = {1550-4832},
    coden           = {????},
    journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
    keywords        = {bit-serial; data parallel; defect tolerance; DNA;
                       nanocomputing; Self-organizing; SIMD},
    abstract        = {The continual decrease in transistor size (through
                       either scaled CMOS or emerging nanotechnologies)
                       promises to usher in an era of tera to peta-scale
                       integration but with increasing defects. Regardless of
                       fabrication methodology (top-down or bottom-up),
                       defect-tolerant architectures are necessary to exploit
                       the full potential of future increased device
                       densities.\par

                       This article explores a defect-tolerant SIMD
                       architecture (SOSA) that self-organizes a large number
                       of limited capability nodes with high defect rates into
                       SIMD processing elements. Simulation results show that
                       SOSA matches or exceeds the performance of conventional
                       systems for moderate to large problems, but with lower
                       power density.},
    acknowledgement = ack-nhfb,
    bibdate         = {Fri Jun 20 11:03:32 MDT 2008},
    bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
}

%%% JETC 3(3), November 2007, article 11: editorial for the special issue.
@article{Chakrabarty:2007:ESI,
    author          = {Krishnendu Chakrabarty and Sachin Sapatnekar},
    title           = {Editorial to special issue {DAC 2006}},
    journal         = j-JETC,
    fjournal        = {ACM Journal on Emerging Technologies in Computing
                       Systems (JETC)},
    year            = {2007},
    month           = nov,
    volume          = {3},
    number          = {3},
    articleno       = {11},
    pages           = {11:1--11:??},
    doi             = {https://doi.org/10.1145/1295231.1295232},
    issn            = {1550-4832 (print), 1550-4840 (electronic)},
    issn-l          = {1550-4832},
    coden           = {????},
    journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
    acknowledgement = ack-nhfb,
    bibdate         = {Fri Jun 20 11:03:49 MDT 2008},
    bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
}

%%% JETC 3(3), November 2007, article 12.
@article{Paul:2007:PBC,
    author          = {Bipul C. Paul and Shinobu Fujita and Masaki Okajima
                       and Thomas Lee},
    title           = {Prospect of ballistic {CNFET} in high performance
                       applications: {Modeling} and analysis},
    journal         = j-JETC,
    fjournal        = {ACM Journal on Emerging Technologies in Computing
                       Systems (JETC)},
    year            = {2007},
    month           = nov,
    volume          = {3},
    number          = {3},
    articleno       = {12},
    pages           = {12:1--12:??},
    doi             = {https://doi.org/10.1145/1295231.1295233},
    issn            = {1550-4832 (print), 1550-4840 (electronic)},
    issn-l          = {1550-4832},
    coden           = {????},
    journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
    keywords        = {Ballistic carbon nanotube FET (CNFET); circuit
                       compatible model; circuit performance; parasitic
                       capacitance; process variability},
    abstract        = {With the advent of carbon nanotube technology,
                       evaluating circuit and system performance using these
                       devices is becoming extremely important. In this
                       article, we present a quasi-analytical device model for
                       intrinsic ballistic CNFET, which can be used in any
                       conventional circuit simulator like SPICE. This simple
                       quasi-analytical model is effective in a wide variety
                       of CNFET structures as well as for a wide range of
                       operating conditions in the digital circuit application
                       domain. We also provide insight into how the parasitic
                       fringe capacitance in state-of-the-art CNFET geometries
                       impacts the overall performance of CNFET circuits. We
                       show that unless the device width can be significantly
                       reduced, the effective gate capacitance of CNFET will
                       be strongly dominated by the parasitic fringe
                       capacitances, and the superior performance of intrinsic
                       CNFET over silicon MOSFET cannot be achieved in
                       circuit. We further show that unlike conventional
                       MOSFET, nanotube FETs are significantly less sensitive
                       to many process parameter variations due to their
                       inherent device structures and cylindrical gate
                       geometry.},
    acknowledgement = ack-nhfb,
    bibdate         = {Fri Jun 20 11:03:49 MDT 2008},
    bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
}

%%% JETC 3(3), November 2007, article 13.
@article{Yuh:2007:PDT,
    author          = {Ping-Hung Yuh and Chia-Lin Yang and Yao-Wen Chang},
    title           = {Placement of defect-tolerant digital microfluidic
                       biochips using the {$T$}-tree formulation},
    journal         = j-JETC,
    fjournal        = {ACM Journal on Emerging Technologies in Computing
                       Systems (JETC)},
    year            = {2007},
    month           = nov,
    volume          = {3},
    number          = {3},
    articleno       = {13},
    pages           = {13:1--13:??},
    doi             = {https://doi.org/10.1145/1295231.1295234},
    issn            = {1550-4832 (print), 1550-4840 (electronic)},
    issn-l          = {1550-4832},
    coden           = {????},
    journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
    keywords        = {biochip; Microfluidics; placement},
    abstract        = {Droplet-based microfluidic biochips have recently
                       gained much attention and are expected to revolutionize
                       the biological laboratory procedures. As biochips are
                       adopted for the complex procedures in molecular
                       biology, its complexity is expected to increase due to
                       the need of multiple and concurrent assays on a chip.
                       In this article, we formulate the placement problem of
                       digital microfluidic biochips with a tree-based
                       topological representation, called $T$-tree. To the
                       best knowledge of the authors, this is the first work
                       that adopts a topological representation to solve the
                       placement problem of digital microfluidic biochips. We
                       also consider the defect tolerant issue to avoid to use
                       defective cells due to fabrication. Experimental
                       results demonstrate that our approach is more efficient
                       and effective than the previous unified synthesis and
                       placement framework.},
    acknowledgement = ack-nhfb,
    bibdate         = {Fri Jun 20 11:03:49 MDT 2008},
    bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
}

%%% JETC 3(3), November 2007, article 14.
@article{Xu:2007:ADP,
    author          = {Tao Xu and William L. Hwang and Fei Su and Krishnendu
                       Chakrabarty},
    title           = {Automated design of pin-constrained digital
                       microfluidic biochips under droplet-interference
                       constraints},
    journal         = j-JETC,
    fjournal        = {ACM Journal on Emerging Technologies in Computing
                       Systems (JETC)},
    year            = {2007},
    month           = nov,
    volume          = {3},
    number          = {3},
    articleno       = {14},
    pages           = {14:1--14:??},
    doi             = {https://doi.org/10.1145/1295231.1295235},
    issn            = {1550-4832 (print), 1550-4840 (electronic)},
    issn-l          = {1550-4832},
    coden           = {????},
    journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
    keywords        = {biochips; droplets; microfluidics; Physical design
                       automation},
    abstract        = {Microfluidics-based biochips, also referred to as
                       lab-on-a-chip, are devices that integrate
                       fluid-handling functions such as sample preparation,
                       analysis, separation, and detection. This emerging
                       technology combines electronics with biology to open
                       new application areas such as point-of-care diagnosis,
                       on-chip DNA analysis, and automated drug discovery. We
                       propose a design automation method for pin-constrained
                       biochips that manipulate nanoliter volumes of discrete
                       droplets on a microfluidic array. In contrast to the
                       direct-addressing scheme that has been studied thus far
                       in the literature, we assign a small number of
                       independent control pins to a large number of
                       electrodes in the biochip, thereby reducing design
                       complexity and product cost. The design procedure
                       relies on a droplet-trace-based array partitioning
                       scheme and an efficient pin assignment technique,
                       referred to as the ``Connect-5 algorithm.'' The
                       proposed method is evaluated using a set of multiplexed
                       bioassays.},
    acknowledgement = ack-nhfb,
    bibdate         = {Fri Jun 20 11:03:49 MDT 2008},
    bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
}

%%% JETC 3(3), November 2007, article 15.  In the abstract the percent
%%% sign is escaped as \% so that TeX prints it literally rather than
%%% treating the rest of the line as a comment.
@Article{Rad:2007:EAP,
  author =       "Reza M. P. Rad and Mohammad Tehranipoor",
  title =        "Evaluating area and performance of hybrid {FPGAs} with
                 nanoscale clusters and {CMOS} routing",
  journal =      j-JETC,
  volume =       "3",
  number =       "3",
  pages =        "15:1--15:??",
  month =        nov,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1295231.1295236",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Fri Jun 20 11:03:49 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Advances in fabrication technology of nanoscale
                 devices such as nanowires, carbon nanotubes and
                 molecular switches provide new opportunities for
                 implementing cluster-based FPGAs. Extensive research is
                 needed to evaluate area and performance of FPGAs made
                 from these devices and compare with their CMOS
                 counterparts. In this work, we propose a hybrid FPGA
                 that uses nanoscale clusters with a functionality
                 similar to the clusters of traditional CMOS FPGAs. The
                 proposed cluster is constructed by a crossbar of
                 nanowires and can be configured to implement the
                 required LUTs and intracluster MUXes. A CMOS interface
                 is also proposed to provide configuration and memory
                 elements for the nanoscale cluster. In the proposed
                 architecture, inter-cluster routing remains at CMOS
                 scale. We have developed models for area and delay of
                 clusters and interconnects of the proposed hybrid FPGA.
                 FPGA tools are configured with these models and used to
                 synthesize and configure the benchmark circuits onto
                 the hybrid FPGAs with NiSi nanowires or nanotubes.
                 Experiments are conducted to evaluate and compare area
                 and performance of the hybrid FPGA and traditional CMOS
                 FPGA (scaled to 22nm). Up to 82\% area reduction was
                 obtained from implementing MCNC benchmarks on the
                 hybrid FPGA. Performance of the hybrid FPGA is shown to
                 be close to that of CMOS FPGA.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "CMOS; FPGA; Nanotechnology; performance; reliability",
}

%%% JETC 3(4), January 2008, article 1.
@article{Su:2008:HLS,
    author          = {Fei Su and Krishnendu Chakrabarty},
    title           = {High-level synthesis of digital microfluidic
                       biochips},
    journal         = j-JETC,
    fjournal        = {ACM Journal on Emerging Technologies in Computing
                       Systems (JETC)},
    year            = {2008},
    month           = jan,
    volume          = {3},
    number          = {4},
    articleno       = {1},
    pages           = {1:1--1:??},
    doi             = {https://doi.org/10.1145/1324177.1324178},
    issn            = {1550-4832 (print), 1550-4840 (electronic)},
    issn-l          = {1550-4832},
    coden           = {????},
    journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
    keywords        = {biochips; High-level synthesis; microfluidics;
                       scheduling; system-on-chip},
    abstract        = {Microfluidic biochips offer a promising platform for
                       massively parallel DNA analysis, automated drug
                       discovery, and real-time biomolecular recognition.
                       Current techniques for full-custom design of
                       droplet-based ``digital'' biochips do not scale well
                       for concurrent assays and for next-generation
                       system-on-chip (SOC) designs that are expected to
                       include microfluidic components. We propose a system
                       design methodology that attempts to apply classical
                       high-level synthesis techniques to the design of
                       digital microfluidic biochips. We focus here on the
                       problem of scheduling bioassay functions under resource
                       constraints. We first develop an optimal scheduling
                       strategy based on integer linear programming. However,
                       because the scheduling problem is NP-complete, we also
                       develop two heuristic techniques that scale well for
                       large problem instances. A clinical diagnostic
                       procedure, namely multiplexed in-vitro diagnostics on
                       human physiological fluids, is first used to illustrate
                       and evaluate the proposed method. Next, the synthesis
                       approach is applied to a protein assay, which serves as
                       a more complex bioassay application. The proposed
                       synthesis approach is expected to reduce human effort
                       and design cycle time, and it will facilitate the
                       integration of microfluidic components with
                       microelectronic components in next-generation SOCs.},
    acknowledgement = ack-nhfb,
    bibdate         = {Fri Jun 20 11:04:00 MDT 2008},
    bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
}

%%% JETC 3(4), January 2008, article 2.  The first author's two-word
%%% surname is braced as {Van Meter}.
@article{VanMeter:2008:ADM,
    author          = {Rodney {Van Meter} and W. J. Munro and Kae Nemoto and
                       Kohei M. Itoh},
    title           = {Arithmetic on a distributed-memory quantum
                       multicomputer},
    journal         = j-JETC,
    fjournal        = {ACM Journal on Emerging Technologies in Computing
                       Systems (JETC)},
    year            = {2008},
    month           = jan,
    volume          = {3},
    number          = {4},
    articleno       = {2},
    pages           = {2:1--2:??},
    doi             = {https://doi.org/10.1145/1324177.1324179},
    issn            = {1550-4832 (print), 1550-4840 (electronic)},
    issn-l          = {1550-4832},
    coden           = {????},
    journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
    keywords        = {quantum computer architecture; Quantum computing},
    abstract        = {We evaluate the performance of quantum arithmetic
                       algorithms run on a distributed quantum computer (a
                       quantum multicomputer). We vary the node capacity and
                       I/O capabilities, and the network topology. The
                       tradeoff of choosing between gates executed remotely,
                       through ``teleported gates'' on entangled pairs of
                       qubits (telegate), versus exchanging the relevant
                       qubits via quantum teleportation, then executing the
                       algorithm using local gates (teledata), is examined. We
                       show that the teledata approach performs better, and
                       that carry-ripple adders perform well when the
                       teleportation block is decomposed so that the key
                       quantum operations can be parallelized. A node size of
                       only a few logical qubits performs adequately provided
                       that the nodes have two transceiver qubits. A linear
                       network topology performs acceptably for a broad range
                       of system sizes and performance parameters. We
                       therefore recommend pursuing small, high-I/O bandwidth
                       nodes and a simple network. Such a machine will run
                       Shor's algorithm for factoring large numbers
                       efficiently.},
    acknowledgement = ack-nhfb,
    bibdate         = {Fri Jun 20 11:04:00 MDT 2008},
    bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
}

%%% JETC 3(4), January 2008, article 3.
@article{Ma:2008:MCE,
    author          = {Xiaojun Ma and Jing Huang and Fabrizio Lombardi},
    title           = {A model for computing and energy dissipation of
                       molecular {QCA} devices and circuits},
    journal         = j-JETC,
    fjournal        = {ACM Journal on Emerging Technologies in Computing
                       Systems (JETC)},
    year            = {2008},
    month           = jan,
    volume          = {3},
    number          = {4},
    articleno       = {3},
    pages           = {3:1--3:??},
    doi             = {https://doi.org/10.1145/1324177.1324180},
    issn            = {1550-4832 (print), 1550-4840 (electronic)},
    issn-l          = {1550-4832},
    coden           = {????},
    journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
    keywords        = {emerging technology; QCA; reversible computing;
                       thermodynamic analysis},
    abstract        = {Quantum-dot Cellular Automata is an emerging
                       technology that offers significant improvements over
                       CMOS. Recently QCA has been advocated as a technology
                       for implementing reversible computing. However,
                       existing tools for QCA design and evaluation have
                       limited capabilities. This paper presents a new
                       mechanical-based model for computing in QCA. By
                       avoiding a full quantum-thermodynamical calculation, it
                       offers a classical view of the principles of QCA
                       operation and can be used in evaluating energy
                       dissipation for reversible computing. The proposed
                       model is mechanically based and is applicable to
                       six-dot (neutrally charged) QCA cells for molecular
                       implementation. The mechanical model consists of a
                       sleeve of changing shape; four electrically charged
                       balls are connected by a stick that rotates around an
                       axle in the sleeve. The sleeve acts as a clocking unit,
                       while the angular position of the stick within the
                       changing shape of the sleeve, identifies the phase for
                       quasi-adiabatic switching. A thermodynamic analysis of
                       the proposed model is presented. The behaviors of
                       various QCA basic devices and circuits are analyzed
                       using the proposed model. It is shown that the proposed
                       model is capable of evaluating the energy consumption
                       for reversible computing at device and circuit levels
                       for molecular QCA implementation. As applicable to QCA,
                       two clocking schemes are also analyzed for energy
                       dissipation and performance (in terms of number of
                       clocking zones).},
    acknowledgement = ack-nhfb,
    bibdate         = {Fri Jun 20 11:04:00 MDT 2008},
    bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
}

@Article{Chuang:2008:SRS,
  author =       "Min-Lun Chuang and Chun-Yao Wang",
  title =        "Synthesis of reversible sequential elements",
  journal =      j-JETC,
  volume =       "3",
  number =       "4",
  pages =        "4:1--4:??",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1324177.1324181",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Fri Jun 20 11:04:00 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "To construct a reversible sequential circuit,
                 reversible sequential elements are required. This work
                 presents novel designs of reversible sequential
                 elements such as the $D$ latch, $ J K$ latch, and $T$
                 latch. Based on these reversible latches, we construct
                 the designs of the corresponding flip-flops. Then we
                 further discuss the physical implementations of our
                 designs based on electron waveguide $Y$-branch switch
                 technology. Test costs, including test generation and
                 test application, of reversible sequential circuits
                 with these reversible flip-flops are also discussed.
                 Compared with previous work, the implementation cost of
                 our new designs, including the number of gates and the
                 number of garbage outputs, is significantly reduced.
                 The number of gates in our designs is 47.4\% of the
                 designs in previous work on average. The number of
                 garbage outputs in our designs is 25\% of the designs
                 in previous work on average.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "Reversible logic; sequential circuits; sequential
                 elements",
}

@article{Metodi:2008:HLI,
    author = {Tzvetan S. Metodi and Darshan D. Thaker and Andrew W.
        Cross and Isaac L. Chuang and Frederic T. Chong},
    title = {High-level interconnect model for the quantum logic
        array architecture},
    journal = j-JETC,
    volume = {4},
    number = {1},
    pages = {1:1--1:??},
    month = mar,
    year = {2008},
    CODEN = {????},
    DOI = {https://doi.org/10.1145/1330521.1330522},
    ISSN = {1550-4832 (print), 1550-4840 (electronic)},
    ISSN-L = {1550-4832},
    bibdate = {Fri Jun 20 11:04:09 MDT 2008},
    bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
    abstract = {We summarize the main characteristics of the quantum
        logic array (QLA) architecture with a careful look at
        the key issues not described in the original conference
        publications: primarily, the teleportation-based
        logical interconnect. The design goal of the quantum
        logic array architecture is to illustrate a model for a
        large-scale quantum architecture that solves the
        primary challenges of system-level reliability and data
        distribution over large distances. The QLA's logical
        interconnect design, which employs the quantum repeater
        protocol, is in principle capable of supporting the
        communication requirements for applications as large as
        the factoring of a 2048-bit number using Shor's quantum
        factoring algorithm. Our physical-level assumptions and
        architectural component validations are based on the
        trapped ion technology for implementing quantum
        computing.},
    acknowledgement = ack-nhfb,
    articleno = {1},
    fjournal = {ACM Journal on Emerging Technologies in Computing
        Systems (JETC)},
    journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J967},
    keywords = {fault tolerance; large scale; QLA; quantum; Quantum
        computer architecture design; teleportation},
}

@article{Donald:2008:RLS,
    author = {James Donald and Niraj K. Jha},
    title = {Reversible logic synthesis with {Fredkin} and {Peres}
        gates},
    journal = j-JETC,
    volume = {4},
    number = {1},
    pages = {2:1--2:??},
    month = mar,
    year = {2008},
    CODEN = {????},
    DOI = {https://doi.org/10.1145/1330521.1330523},
    ISSN = {1550-4832 (print), 1550-4840 (electronic)},
    ISSN-L = {1550-4832},
    bibdate = {Fri Jun 20 11:04:09 MDT 2008},
    bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
    abstract = {Reversible logic has applications in low-power
        computing and quantum computing. Most reversible logic
        synthesis methods are tied to particular gate types,
        and cannot synthesize large functions. This article
        extends RMRLS, a reversible logic synthesis tool, to
        include additional gate types. While classic RMRLS can
        synthesize functions using NOT, CNOT, and $n$-bit
        Toffoli gates, our work details the inclusion of
        $n$-bit Fredkin and Peres gates. We find that these
        additional gates reduce the average gate count for
        three-variable functions from 6.10 to 4.56, and improve
        the synthesis results of many larger functions, both in
        terms of gate count and quantum cost.},
    acknowledgement = ack-nhfb,
    articleno = {2},
    fjournal = {ACM Journal on Emerging Technologies in Computing
        Systems (JETC)},
    journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J967},
    keywords = {Quantum computing; reversible logic},
}

@article{Guiducci:2008:HPP,
    author = {Carlotta Guiducci and Christine Nardini},
    title = {High parallelism, portability, and broad
        accessibility: {Technologies} for genomics},
    journal = j-JETC,
    volume = {4},
    number = {1},
    pages = {3:1--3:??},
    month = mar,
    year = {2008},
    CODEN = {????},
    DOI = {https://doi.org/10.1145/1330521.1330524},
    ISSN = {1550-4832 (print), 1550-4840 (electronic)},
    ISSN-L = {1550-4832},
    bibdate = {Fri Jun 20 11:04:09 MDT 2008},
    bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
    abstract = {Biotechnology is an area of great innovations that
        promises to have deep impact on everyday life thanks to
        profound changes in biology, medicine, and health care.
        This article will span from the description of the
        biochemical principles of molecular biology to the
        definition of the physics that supports the technology
        and to the devices and algorithms necessary to observe
        molecular events in a controlled, portable, and highly
        parallel manner. Throughout this discussion, constant
        attention will be given to the ultimate goals and
        applications of these innovations as well as to the
        related issues.},
    acknowledgement = ack-nhfb,
    articleno = {3},
    fjournal = {ACM Journal on Emerging Technologies in Computing
        Systems (JETC)},
    journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J967},
    keywords = {biosensors; Genomics; microarrays; point-of-care
        diagnostics},
}

@article{Narayanan:2008:E,
    author = {Vijaykrishnan Narayanan},
    title = {Editorial},
    journal = j-JETC,
    volume = {4},
    number = {2},
    pages = {4:1--4:??},
    month = apr,
    year = {2008},
    CODEN = {????},
    DOI = {https://doi.org/10.1145/1350763.1350764},
    ISSN = {1550-4832 (print), 1550-4840 (electronic)},
    ISSN-L = {1550-4832},
    bibdate = {Fri Jun 20 11:04:16 MDT 2008},
    bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
    acknowledgement = ack-nhfb,
    articleno = {4},
    fjournal = {ACM Journal on Emerging Technologies in Computing
        Systems (JETC)},
    journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@article{Bahar:2008:IJA,
    author = {R. Iris Bahar and Krishnendu Chakrabarty},
    title = {Introduction to joint {ACM JETC\slash TODAES} special
        issue on new, emerging, and specialized technologies},
    journal = j-JETC,
    volume = {4},
    number = {2},
    pages = {5:1--5:??},
    month = apr,
    year = {2008},
    CODEN = {????},
    DOI = {https://doi.org/10.1145/1350763.1350765},
    ISSN = {1550-4832 (print), 1550-4840 (electronic)},
    ISSN-L = {1550-4832},
    bibdate = {Fri Jun 20 11:04:16 MDT 2008},
    bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
    acknowledgement = ack-nhfb,
    articleno = {5},
    fjournal = {ACM Journal on Emerging Technologies in Computing
        Systems (JETC)},
    journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@article{Kuo:2008:MSA,
    author = {Shih-Hsien Kuo and Bruce Tidor and Jacob White},
    title = {A meshless, spectrally accurate, integral equation
        solver for molecular surface electrostatics},
    journal = j-JETC,
    volume = {4},
    number = {2},
    pages = {6:1--6:??},
    month = apr,
    year = {2008},
    CODEN = {????},
    DOI = {https://doi.org/10.1145/1350763.1350766},
    ISSN = {1550-4832 (print), 1550-4840 (electronic)},
    ISSN-L = {1550-4832},
    bibdate = {Fri Jun 20 11:04:16 MDT 2008},
    bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
    abstract = {The need to determine electrostatic fields in domains
        bounded by molecular surfaces arises in a number of
        nanotechnology applications including: biomolecule
        design, carbon nanotube simulation, and molecular
        electron transport analysis. Molecular surfaces are
        typically smooth, without the corners common in
        electrical interconnect problems, but are often so
        geometrically complicated that numerical evaluation of
        the associated electrostatic fields is extremely
        time-consuming. In this paper we describe and
        demonstrate a meshless spectrally-accurate integral
        equation method that only requires a description of the
        molecular surface in the form of a collection of
        surface points. Our meshless method is a synthesis of
        techniques, suitably adapted, including: spherical
        harmonic surface interpolation, spectral-element-like
        integral equation discretization, integral
        desingularization via variable transformation, and
        matrix-implicit iterative matrix solution. The spectral
        accuracy of this combined method is verified using
        analytically solvable sphere and ellipsoid problems,
        and then its accuracy and efficiency is demonstrated
        numerically by solving capacitance and coupled
        Poisson\slash linearized Poisson--Boltzmann problems
        associated with a commonly used model of a molecule in
        solution. The results demonstrate that for a tolerance
        of 10$^{-3}$ this new approach reduces the number of
        unknowns by as much as two orders of magnitude over the
        more commonly used flat panel methods.},
    acknowledgement = ack-nhfb,
    articleno = {6},
    fjournal = {ACM Journal on Emerging Technologies in Computing
        Systems (JETC)},
    journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J967},
    keywords = {integral equation; meshless; Poisson--Boltzmann
        equation; spectral method},
}

@Article{Deng:2008:CNT,
  author =       "Jie Deng and Albert Lin and Gordon C. Wan and H.-S.
                 Philip Wong",
  title =        "Carbon nanotube transistor compact model for circuit
                 design and performance optimization",
  journal =      j-JETC,
  volume =       "4",
  number =       "2",
  pages =        "7:1--7:??",
  month =        apr,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1350763.1350767",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Fri Jun 20 11:04:16 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "In this paper, we describe the development of the
                 Stanford University Carbon Nanotube FET (CNFET) Compact
                 Model. The CNFET Model is a circuit-compatible, compact
                 model which describes enhancement-mode, CMOS-like
                 CNFETs. It can be used to simulate both functionality
                 and performance of large-scale circuits with hundreds
                 of CNFETs. To produce realistic and relevant results,
                 the model accounts for several practical non-idealities
                 such as scattering in the near-ballistic channel,
                 effects of the source/drain extension region, and
                 charge-screening for multiple-nanotube CNFETs. The
                 model also includes a full transcapacitance network for
                 more accurate transient and AC results. The Stanford
                 University CNFET Model is implemented in both HSPICE
                 macro language and VerilogA. The VerilogA
                 implementation shows speedups of roughly $ 7 \times $
                 -- $ 15 \times $ over HSPICE. Applications of the model
                 suggest that $n$- and $p$-CNFETs will have $ 6 \times $
                 and $ 13 \times $ speed advantage over Si $n$- and
                 $p$-MOSFETs respectively at the 32nm node, and that a
                 CNT density of 250 CNTs/$ \mu $m is ideal for
                 multiple-nanotube gates. Such a compact CNFET model
                 will be absolutely essential in ushering in the Design
                 Era of CNFET circuits as carbon nanotube technology
                 outgrows its ``science discovery'' phase.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "carbon nanotube FET; CNT; compact model; HSPICE;
                 VerilogA",
}

@article{Carmona:2008:FMA,
    author = {Josep Carmona and Jordi Cortadella and Yousuke Takada
        and Ferdinand Peper},
    title = {Formal methods for the analysis and synthesis of
        nanometer-scale cellular arrays},
    journal = j-JETC,
    volume = {4},
    number = {2},
    pages = {8:1--8:??},
    month = apr,
    year = {2008},
    CODEN = {????},
    DOI = {https://doi.org/10.1145/1350763.1350768},
    ISSN = {1550-4832 (print), 1550-4840 (electronic)},
    ISSN-L = {1550-4832},
    bibdate = {Fri Jun 20 11:04:16 MDT 2008},
    bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
    abstract = {Nanometer-scale structures suitable for computing have
        been investigated by several research groups in recent
        years. A common feature of these structures is their
        dynamic evolution through cascaded local interactions
        embedded on a discrete grid. Finding configurations
        capable of conducting computations is a task that often
        requires tedious experiments in laboratories. Formal
        methods --- though used extensively for the
        specification and verification of software and hardware
        computing systems --- are virtually unexplored with
        respect to computational structures at atomic scales.
        This paper presents a systematic approach toward the
        application of formal methods in this context, using
        techniques like abstraction, model-checking, and
        symbolic representations of states to explore and
        discover computational structures. The proposed
        techniques are applied to a system of CO molecules on a
        grid of Copper atoms, resulting in the design of a
        complete library of combinational logic gates based on
        this molecular system. The techniques are also applied
        on (more general) systems of cellular automata that
        employ an asynchronous mode of timing. The use of
        formal methods may narrow the gap between Physical
        Chemistry and Computer Science, allowing the
        description of interactions of nanometer scale systems
        on a level of abstraction suitable to devise computing
        devices.},
    acknowledgement = ack-nhfb,
    articleno = {8},
    fjournal = {ACM Journal on Emerging Technologies in Computing
        Systems (JETC)},
    journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J967},
    keywords = {cellular array; model checking; Nanocomputing;
        symbolic techniques},
}

@article{Crocker:2008:MQD,
    author = {Michael Crocker and Michael Niemier and X. Sharon Hu
        and Marya Lieberman},
    title = {Molecular {QCA} design with chemically reasonable
        constraints},
    journal = j-JETC,
    volume = {4},
    number = {2},
    pages = {9:1--9:??},
    month = apr,
    year = {2008},
    CODEN = {????},
    DOI = {https://doi.org/10.1145/1350763.1350769},
    ISSN = {1550-4832 (print), 1550-4840 (electronic)},
    ISSN-L = {1550-4832},
    bibdate = {Fri Jun 20 11:04:16 MDT 2008},
    bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
    abstract = {In this article we examine the impacts of the
        fundamental constraints required for circuits and
        systems made from molecular Quantum-dot Cellular
        Automata (QCA) devices. Our design constraints are
        ``chemically reasonable'' in that we consider the
        characteristics and dimensions of devices and
        scaffoldings that have actually been fabricated. This
        work is a necessary first step for any work in QCA CAD,
        and can also help shape experiments in the physical
        sciences for emerging, nano-scale devices. Our work
        shows that QCA circuits, scaffoldings, substrates, and
        devices should all be considered simultaneously.
        Otherwise, there is a very real possibility that the
        devices and scaffoldings that are eventually
        manufactured will result in devices that only work in
        isolation. ``Chemically reasonable'' also means that
        expected manufacturing defects must be considered. In
        our simulations we introduce defects associated with
        self-assembled systems into various designs to begin to
        define manufacturing tolerances. This work is
        especially timely as experimentalists are beginning to
        work on merging experimental tracks that address
        devices and scaffolds --- and the end result should
        facilitate correct logical operations.},
    acknowledgement = ack-nhfb,
    articleno = {9},
    fjournal = {ACM Journal on Emerging Technologies in Computing
        Systems (JETC)},
    journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J967},
    keywords = {defects; Nanotechnology; physical simulation;
        quantum-dot cellular automata},
}

@article{Lebeck:2008:IDS,
    author = {Alvin R. Lebeck and Krishnendu Chakrabarty},
    title = {Introduction to {DAC 2007} special section},
    journal = j-JETC,
    volume = {4},
    number = {3},
    pages = {10:1--10:??},
    month = aug,
    year = {2008},
    CODEN = {????},
    DOI = {https://doi.org/10.1145/1389089.1389090},
    ISSN = {1550-4832 (print), 1550-4840 (electronic)},
    ISSN-L = {1550-4832},
    bibdate = {Thu Sep 4 14:23:10 MDT 2008},
    bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
    acknowledgement = ack-nhfb,
    articleno = {10},
    fjournal = {ACM Journal on Emerging Technologies in Computing
        Systems (JETC)},
    journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@article{Xu:2008:IDR,
    author = {Tao Xu and Krishnendu Chakrabarty},
    title = {Integrated droplet routing and defect tolerance in the
        synthesis of digital microfluidic biochips},
    journal = j-JETC,
    volume = {4},
    number = {3},
    pages = {11:1--11:??},
    month = aug,
    year = {2008},
    CODEN = {????},
    DOI = {https://doi.org/10.1145/1389089.1389091},
    ISSN = {1550-4832 (print), 1550-4840 (electronic)},
    ISSN-L = {1550-4832},
    bibdate = {Thu Sep 4 14:23:10 MDT 2008},
    bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
    abstract = {Microfluidic biochips are revolutionizing
        high-throughput DNA sequencing, immunoassays, and
        clinical diagnostics. As high-throughput bioassays are
        mapped to digital microfluidic platforms, the need for
        design automation techniques is being increasingly
        felt. Moreover, as most applications of biochips are
        safety-critical in nature, defect tolerance is an
        essential system attribute. Several synthesis tools
        have recently been proposed for the automated design of
        biochips from the specifications of laboratory
        protocols. However, only a few of these tools address
        the problem of defect tolerance. In addition, most of
        these methods do not consider the problem of droplet
        routing in microfluidic arrays. These methods typically
        rely on postsynthesis droplet routing to implement
        biochemical protocols. Such an approach is not only
        time consuming, but also imposes an undue burden on the
        chip user. Postsynthesis droplet routing does not
        guarantee that feasible droplet pathways can be found
        for area-constrained biochip layouts; nonroutable
        fabricated biochips must be discarded. We present a
        synthesis tool that integrates defect tolerance and
        droplet routing in the design flow. Droplet
        routability, defined as the ease with which droplet
        pathways can be determined, is estimated and integrated
        in the synthesis procedure. Presynthesis and
        postsynthesis defect-tolerance methods are also
        presented. We use a large-scale protein assay as a case
        study to evaluate the proposed synthesis method.},
    acknowledgement = ack-nhfb,
    articleno = {11},
    fjournal = {ACM Journal on Emerging Technologies in Computing
        Systems (JETC)},
    journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J967},
    keywords = {biochips; microfluidics; module placement; physical
        design automation},
}

@article{Huang:2008:RAF,
    author = {Tsung-Ching Huang and Kwang-Ting (Tim) Cheng and
        Huai-Yuan Tseng and Chen-Pang Kung},
    title = {Reliability analysis for flexible electronics: {Case}
        study of integrated {a-Si:H} {TFT} scan driver},
    journal = j-JETC,
    volume = {4},
    number = {3},
    pages = {12:1--12:??},
    month = aug,
    year = {2008},
    CODEN = {????},
    DOI = {https://doi.org/10.1145/1389089.1389092},
    ISSN = {1550-4832 (print), 1550-4840 (electronic)},
    ISSN-L = {1550-4832},
    bibdate = {Thu Sep 4 14:23:10 MDT 2008},
    bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
    abstract = {Flexible electronics fabricated on thin-film,
        lightweight, and bendable substrates (e.g., plastic)
        have great potential for novel applications in consumer
        electronics such as flexible displays, e-paper, and
        smart labels; however, the key elements, namely
        thin-film transistors (TFTs), for implementing flexible
        circuits often suffer from electrical instability.
        Therefore, thorough reliability analysis is critical
        for flexible circuit design to ensure that the circuit
        will operate reliably throughout its lifetime. In this
        article we propose a methodology for reliability
        simulation of hydrogenated amorphous silicon (a-Si:H)
        TFT circuits. We show that: (1) the threshold voltage
        ({\em V$_{TH}$ \/}) shift of a single TFT can be
        estimated by analyzing its operating conditions; and
        (2) the circuit lifetime can be predicted accordingly
        by using SPICE-like simulators with proper modeling. We
        also propose an algorithm to reduce the simulation time
        by orders of magnitude, with good prediction accuracy.
        To validate our analytical model and simulation
        methodology, we compare simulation results with the
        actual circuit measurements of an integrated a-Si:H TFT
        scan driver fabricated on a glass substrate and we
        demonstrate very good consistency.},
    acknowledgement = ack-nhfb,
    articleno = {12},
    fjournal = {ACM Journal on Emerging Technologies in Computing
        Systems (JETC)},
    journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J967},
    keywords = {amorphous hydrogenated silicon (a-Si:H); flexible
        electronics; reliability; scan driver; thin-film
        transistor; threshold voltage},
}

@Article{Li:2008:ADP,
  author =       "Jing Li and Aditya Bansal and Swarop Ghosh and Kaushik
                 Roy",
  title =        "An alternate design paradigm for low-power, low-cost,
                 testable hybrid systems using scaled {LTPS TFTs}",
  journal =      j-JETC,
  volume =       "4",
  number =       "3",
  pages =        "13:1--13:??",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1389089.1389093",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Thu Sep 4 14:23:10 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "This article presents a holistic hybrid design
                 methodology for low-power, low-cost, testable digital
                 designs using low-temperature polycrystalline-silicon
                 thin-film transistors (LTPS TFTs). An alternate scaling
                 rule under low thermal budget (due to flexible
                 substrate) is developed to improve the performance of
                 TFTs in the presence of process variation. We
                 demonstrate that LTPS TFTs can be further optimized for
                 ultralow-power subthreshold operation with performances
                 comparable to contemporary single-crystal
                 silicon-on-insulator (c-Si SOI) devices after process
                 optimization. The optimized LTPS TFTs with high current
                 drivability and less variability can comprise a
                 promising low-cost option to augment Si CMOS
                 technology, opening up a plethora of new hybrid 3D
                 applications. We illustrate one such application: IC
                 testing. Testing of complex VLSI systems is a prime
                 concern due to design cost of DFT circuits, area/delay
                 overheads, and poor test confidence. To harness the
                 benefits of TFT technology, a novel low-power,
                 process-tolerant, generic, and reconfigurable test
                 structure designed using LTPS TFTs is proposed to
                 reduce the test cost, as well as to improve
                 diagnosability and verifiability, of complex VLSI
                 systems. Due to proper optimization of TFT devices, the
                 proposed test structure consumes low power but operates
                 with reasonable performance. Furthermore, the test
                 circuits do not consume any silicon area because they
                 can be integrated on-chip using 3D technology. Since
                 the test architecture is reconfigurable, this
                 eliminates the need to redesign built-in-self-test
                 (BIST) components that may vary from one processor
                 generation to another. We have developed test
                 structures using 200nm TFT devices and evaluated them
                 on designs implemented in 130nm bulk CMOS. For circuit
                 simulations, we have developed a SPICE-compatible model
                 for TFT devices. The BIST components designed using the
                 test structures operate at 0.8--4.3 GHz (compared to
                 8.2 GHz in bulk CMOS) with low power consumption. The
                 enhanced scan cells partially implemented in TFT (3D
                 hybrid design) consume $ \sim $24\% less power and
                 $ \sim $15--20\% less area of Si die compared to
                 conventional bulk-Si design (2D planar design), with
                 minimal delay overhead.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "3D integration; BIST; DFT; generic; grain boundary
                 (GB); hybrid system; inherent variation;
                 low-temperature polycrystalline silicon (LTPS);
                 reconfigurable; thin-film transistor (TFT)",
}

%%% Rad and Tehranipoor, JETC volume 4, number 3 (August 2008), article
%%% 14: research article with abstract and keywords.  The pages value
%%% carries the article number as a prefix; its "??" is presumably a
%%% placeholder for an unknown final page (TODO confirm).
@Article{Rad:2008:SNA,
  author =       "Reza Rad and Mohammad Tehranipoor",
  title =        "{SCT}: a novel approach for testing and configuring
                 nanoscale devices",
  journal =      j-JETC,
  volume =       "4",
  number =       "3",
  pages =        "14:1--14:??",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1389089.1389094",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Thu Sep 4 14:23:10 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Novel strategies are necessary to efficiently test and
                 configure emerging reconfigurable nanoscale devices, in
                 addition to providing defect tolerance. This is mainly
                 due to the high defect densities that are expected for
                 these devices. Among different approaches,
                 reconfiguration-based defect avoidance has proven to be
                 a practical solution. However, configuration time, test
                 time, and defect-map size remain among the major
                 challenges for these new devices. In this article, we
                 propose a new approach (called SCT) that simultaneously
                 performs test and configuration. The proposed method
                 uses a built-in self-test (BIST) scheme for test and
                 defect tolerance. The method is based on testing
                 reconfigurable nanoblocks at the time of implementing a
                 function of a desired application on that block. The
                 SCT method considerably reduces the total test and
                 configuration time. It also eliminates the need for
                 storing the location of defects in a defect map on- or
                 off-chip. The presented probabilistic analysis results
                 show the effectiveness of this method in terms of test
                 and configuration time for architectures with rich
                 interconnect resources. Also, a Verilog simulation
                 model is developed for crossbar-based
                 nano-architectures. This model is used to implement
                 several MCNC benchmarks based on the proposed SCT
                 method. The simulation results demonstrate efficiency
                 of the method in terms of test time and yield under
                 different defect rates.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "configuration and testing; crossbar; fault tolerance;
                 nanowire; reconfigurable nanoscale devices",
}

%%% Xie, Cong, and Franzon, JETC volume 4, number 4 (October 2008),
%%% article 15: special-issue editorial; the entry has no abstract or
%%% keywords field.  The braces around "Special" and the math-mode 3D
%%% keep those title words from being lowercased by the style.
@Article{Xie:2008:ESI,
  author =       "Yuan Xie and Jason Cong and Paul Franzon",
  title =        "Editorial: {Special} issue on {$3$D} integrated
                 circuits and microarchitectures",
  journal =      j-JETC,
  volume =       "4",
  number =       "4",
  pages =        "15:1--15:??",
  month =        oct,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1412587.1412588",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Mar 17 14:22:55 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% Kgil et al., JETC volume 4, number 4 (October 2008), article 16:
%%% research article with abstract and keywords.  The pages value
%%% carries the article number as a prefix; its "??" is presumably a
%%% placeholder for an unknown final page (TODO confirm).
@Article{Kgil:2008:PUS,
  author =       "Taeho Kgil and Ali Saidi and Nathan Binkert and Steve
                 Reinhardt and Krisztian Flautner and Trevor Mudge",
  title =        "{PicoServer}: {Using} {$3$D} stacking technology to
                 build energy efficient servers",
  journal =      j-JETC,
  volume =       "4",
  number =       "4",
  pages =        "16:1--16:??",
  month =        oct,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1412587.1412589",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Mar 17 14:22:55 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "This article extends our prior work to show that a
                 straightforward use of 3D stacking technology enables
                 the design of compact energy-efficient servers. Our
                 proposed architecture, called PicoServer, employs 3D
                 technology to bond one die containing several simple,
                 slow processing cores to multiple memory dies
                 sufficient for a primary memory. The multiple memory
                 dies are composed of DRAM. This use of 3D stacks
                 readily facilitates wide low-latency buses between
                 processors and memory. These remove the need for an L2
                 cache allowing its area to be re-allocated to
                 additional simple cores. The additional cores allow the
                 clock frequency to be lowered without impairing
                 throughput. Lower clock frequency means that thermal
                 constraints, a concern with 3D stacking, are easily
                 satisfied. We extend our original analysis on
                 PicoServer to include: (1) a wider set of server
                 workloads, (2) the impact of multithreading, and (3)
                 the on-chip DRAM architecture and system memory usage.
                 PicoServer is intentionally simple, requiring only the
                 simplest form of 3D technology where die are stacked on
                 top of one another. Our intent is to minimize risk of
                 introducing a new technology (3D) to implement a class
                 of low-cost, low-power compact server architectures.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "3D stacking technology; chip multiprocessor;
                 full-system simulation; Low power; Tier-1/2/3 server",
}

%%% Ma et al., JETC volume 4, number 4 (October 2008), article 17:
%%% research article with abstract and keywords.  The pages value
%%% carries the article number as a prefix; its "??" is presumably a
%%% placeholder for an unknown final page (TODO confirm).
@Article{Ma:2008:IEF,
  author =       "Yuchun Ma and Yongxiang Liu and Eren Kursun and Glenn
                 Reinman and Jason Cong",
  title =        "Investigating the effects of fine-grain
                 three-dimensional integration on microarchitecture
                 design",
  journal =      j-JETC,
  volume =       "4",
  number =       "4",
  pages =        "17:1--17:??",
  month =        oct,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1412587.1412590",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Mar 17 14:22:55 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "In this article we propose techniques that enable
                 efficient exploration of the 3D design space, where
                 each logical block can span more than one silicon
                 layer. Fine-grain 3D integration provides reduced
                 intrablock wire delay as well as improved power
                 consumption. However, the corresponding power and
                 performance advantage is usually underutilized, since
                 various implementations of multilayer blocks require
                 novel physical design and microarchitecture
                 infrastructure to explore 3D microarchitecture design
                 space. We develop a cubic packing engine which can
                 simultaneously optimize physical and architectural
                 design for efficient vertical integration. This
                 technique selects the individual unit designs from a
                 set of single-layer or multilayer implementations to
                 get the best microarchitectural design in terms of
                 performance, temperature, or both. Our experimental
                 results using a design driver of a high-performance
                 superscalar processor show a 36\% performance
                 improvement over traditional 2D for 2--4 layers and
                 14\% over 3D with single-layer unit implementations.
                 Since thermal characteristics of 3D integrated circuits
                 are among the main challenges, thermal-aware
                 floorplanning and thermal via insertion techniques are
                 employed to keep the peak temperatures below
                 threshold.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "3D integration; 3D packing; microarchitecture;
                 thermal",
}

%%% Zhan and Sapatnekar, JETC volume 4, number 4 (October 2008), article
%%% 18: research article with an abstract but no keywords field.  The
%%% braces around "Vdd" in the title preserve its capitalization under
%%% styles that lowercase titles.
@Article{Zhan:2008:AMA,
  author =       "Yong Zhan and Sachin S. Sapatnekar",
  title =        "Automated module assignment in stacked-{Vdd} designs
                 for high-efficiency power delivery",
  journal =      j-JETC,
  volume =       "4",
  number =       "4",
  pages =        "18:1--18:??",
  month =        oct,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1412587.1412591",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Mar 17 14:22:55 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "With aggressive reductions in feature sizes and the
                 integration of multiple functionalities on the same
                 die, bottlenecks due to I/O pin limitations have become
                 a critical issue in today's VLSI designs, especially
                 for 3D IC technologies. To alleviate the pin limitation
                 problem, a stacked-Vdd circuit paradigm has recently
                 been proposed in the literature. However, for a circuit
                 designed using this paradigm, a significant amount of
                 power may be wasted if modules are not carefully
                 assigned to different Vdd domains. In this article, we
                 present a partition-based algorithm for efficiently
                 assigning modules at the floorplanning level, so as to
                 reuse currents between Vdd domains and minimize the
                 power wasted during the operation of the circuit.
                 Experimental results on both 3D and 2D ICs show that
                 compared with assigning modules to different Vdd
                 domains using enumeration and simulated annealing, our
                 algorithm can generate circuits with competitive power
                 and IR noise performance, while being orders of
                 magnitude faster.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% Ferri, Reda, and Bahar, JETC volume 4, number 4 (October 2008),
%%% article 19: research article with abstract and keywords.  The
%%% multiplication sign in the abstract is written in math mode
%%% ($ 2 \times $) because \times is a math-only macro.
@Article{Ferri:2008:PYM,
  author =       "Cesare Ferri and Sherief Reda and R. Iris Bahar",
  title =        "Parametric yield management for {$3$D} {ICs}: {Models}
                 and strategies for improvement",
  journal =      j-JETC,
  volume =       "4",
  number =       "4",
  pages =        "19:1--19:??",
  month =        oct,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1412587.1412592",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Mar 17 14:22:55 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Three-Dimensional (3D) Integrated Circuits (ICs) that
                 integrate die with Through-Silicon Vias (TSVs) promise
                 to continue system and functionality scaling beyond the
                 traditional geometric 2D device scaling. 3D integration
                 also improves the performance of ICs by reducing the
                 communication time between different chip components
                 through the use of short TSV-based vertical wires. This
                 reduction is particularly attractive in processors
                 where it is desirable to reduce the access time between
                 the main logic die and the L2 cache or the main memory
                 die. Process variations in 2D ICs lead to a drop in
                 parametric yield (as measured by speed, leakage and
                 sales profits), which forces manufacturers to speed bin
                 their chips and to sell slow chips at reduced prices.
                 In this paper we develop a model to quantify the impact
                 of process variations on the parametric yield of 3D
                 ICs, and then we propose a number of integration
                 strategies that use a graph-theoretic framework to
                 maximize the performance, parametric yield and profits
                 of 3D ICs. Comparing our proposed strategies to current
                 yield-oblivious methods, it is demonstrated that it is
                 possible to increase the number of 3D ICs in the
                 fastest speed bins by almost $ 2 \times $, while
                 simultaneously reducing the number of slow ICs by
                 29.4\%. This leads to an improvement in performance by
                 up to 6.45\% and an increase of about 12.48\% in total
                 sales revenue using up-to-date market price models.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "3D integration; leakage; performance; process
                 variations; yield management",
}

%%% Miyakawa et al., JETC volume 4, number 4 (October 2008), article 20:
%%% research article with abstract and keywords.  Unit symbols in the
%%% abstract (ohm, micro) are written in math mode because \Omega and
%%% \mu are math-only macros; in text mode they raise a LaTeX error
%%% under any style that typesets the abstract field.
@Article{Miyakawa:2008:MST,
  author =       "Nobuaki Miyakawa and Eiri Hashimoto and Takanori
                 Maebashi and Natsuo Nakamura and Yutaka Sacho and
                 Shigeto Nakayama and Shinjiro Toyoda",
  title =        "Multilayer stacking technology using wafer-to-wafer
                 stacked method",
  journal =      j-JETC,
  volume =       "4",
  number =       "4",
  pages =        "20:1--20:??",
  month =        oct,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1412587.1412593",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Mar 17 14:22:55 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "We have developed a new three-dimensional stacking
                 technology using the wafer-to-wafer stacked method.
                 Electrical conductivity between each wafer is almost
                 100\% and contact resistance is less than
                 0.7 $ \Omega $ between a through-silicon via (TSV) and
                 a microbump. We have also created a prototype of a
                 three-layer stacking device using our technology, where
                 each wafer for the stacking is fabricated by using
                 0.18 $ \mu $m CMOS technology based on 8-inch wafers.
                 The device is operated by two times the frequency of
                 the multichip module (MCM) device case using a
                 two-dimensional device with identical functions and
                 minimally different power consumption. The yields
                 obtained from the results comprising all functional
                 tests are over 60\%.",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "3D integration; design; hardware; stacking process",
}

%%% Shukla, JETC volume 5, number 1 (January 2009), article 1: guest
%%% editorial; the entry has no abstract or keywords field.  Every
%%% capitalized word of the conference name in the title is braced so
%%% that styles which lowercase titles keep the proper name intact.
@Article{Shukla:2009:GEI,
  author =       "Sandeep Shukla",
  title =        "Guest editorial: {IEEE\slash ACM} {Symposium} on
                 {Nanoscale} {Architectures} {(NANOARCH07)}",
  journal =      j-JETC,
  volume =       "5",
  number =       "1",
  pages =        "1:1--1:??",
  month =        jan,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1482613.1482614",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Mar 17 14:23:14 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% S. Wang, L. Wang, and Jain, JETC volume 5, number 1 (January 2009),
%%% article 2: research article with abstract and keywords.  The pages
%%% value carries the article number as a prefix; its "??" is presumably
%%% a placeholder for an unknown final page (TODO confirm).
@Article{Wang:2009:TAR,
  author =       "Shuo Wang and Lei Wang and Faquir Jain",
  title =        "Towards achieving reliable and high-performance
                 nanocomputing via dynamic redundancy allocation",
  journal =      j-JETC,
  volume =       "5",
  number =       "1",
  pages =        "2:1--2:??",
  month =        jan,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1482613.1482615",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Mar 17 14:23:14 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Nanoelectronic devices are considered to be the
                 computational fabrics for the emerging nanocomputing
                 systems due to their ultra-high speed and integration
                 density. However, the imperfect bottom-up self-assembly
                 fabrication leads to excessive defects that have become
                 a barrier for achieving reliable computing. In
                 addition, transient errors continue to be a problem.
                 The massive parallelism rendered by nanoscale
                 integration opens up new opportunities but also poses
                 challenges on how to manage such massive resources for
                 reliable and high-performance computing. In this paper,
                 we propose a nanoarchitecture solution to address these
                 emerging challenges. By using dynamic redundancy
                 allocation, the massive parallelism is exploited to
                 jointly achieve fault (defect/error) tolerance and high
                 performance. Simulation results demonstrate the
                 effectiveness of the proposed technique under a range
                 of fault rates and operating conditions.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "hardware reliability; Nanoscale architecture;
                 performance; redundancy allocation; redundant design",
}

%%% Z. F. Wang et al., JETC volume 5, number 1 (January 2009), article
%%% 3: research article with abstract and keywords.  The braces around
%%% "Graphene-based" keep the first word after the title colon
%%% capitalized under styles that lowercase titles.
@Article{Wang:2009:ENP,
  author =       "Z. F. Wang and Huaixiu Zheng and Q. W. Shi and Jie
                 Chen",
  title =        "Emerging nanodevice paradigm: {Graphene-based}
                 electronics for nanoscale computing",
  journal =      j-JETC,
  volume =       "5",
  number =       "1",
  pages =        "3:1--3:??",
  month =        jan,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1482613.1482616",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Mar 17 14:23:14 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "The continued miniaturization of silicon-based
                 electronic circuits is fast approaching its physical
                 limitations. It is unlikely that advances in
                 miniaturization, following the so-called Moore's Law,
                 can continue in the foreseeable future. Nanoelectronics
                 has to go beyond silicon technology. New device
                 paradigms based on nanoscale materials, such as
                 molecular electronic devices, spin devices and
                 carbon-based devices, will emerge. In this article, we
                 introduce a nanodevice paradigm: graphene
                 nanoelectronics. Due to its unique quantum effects and
                 electronic properties, researchers predict that
                 graphene-based devices may replace carbon nanotube
                 devices and become major building blocks for future
                 nanoscale computing. To manifest its unique electronic
                 properties, we present some of our recent designs,
                 namely a graphene-based switch, a negative differential
                 resistance (NDR) device and a random access memory
                 array (RAM). Since these basic devices are the building
                 blocks for large-scale circuits, our findings can help
                 researchers construct useful computing systems and
                 study graphene-based circuit performance in the
                 future.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "Graphene device; memory structure; negative
                 differential resistance; tight-binding model",
}

%%% Taskin et al., JETC volume 5, number 1 (January 2009), article 4:
%%% research article with abstract and keywords.  All math symbols in
%%% the abstract (\times, \sim) are wrapped in $ ... $ because they are
%%% math-only macros; in text mode they raise a LaTeX error under any
%%% style that typesets the abstract field.
@Article{Taskin:2009:SRB,
  author =       "Baris Taskin and Andy Chiu and Jonathan Salkind and
                 Daniel Venutolo",
  title =        "A shift-register-based {QCA} memory architecture",
  journal =      j-JETC,
  volume =       "5",
  number =       "1",
  pages =        "4:1--4:??",
  month =        jan,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1482613.1482617",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Mar 17 14:23:14 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "A quantum-dot cellular automata (QCA) design of an $ n
                 \times m$-bit, shift-register-based memory architecture
                 is presented. The architecture maintains data at a
                 stable conformation, which is contrary to traditional
                 data in-motion concept for QCA architectures. The
                 memory architecture is based on an existing
                 dual-phase-synchronized, line-based, one-bit QCA memory
                 cell building block that provides size and latency
                 improvements over other known one-bit memory cells
                 through its novel clocking scheme. Read/write latencies
                 up to $ \sim $2X lower than the existing tile-based
                 architecture with three-phase, line-based memory cells
                 are obtained. Simulations with QCADesigner and HDLQ are
                 performed on a sample $ 4 \times 8$ bit memory
                 architecture implementation.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "clocking; memory design; Quantum-dot cellular
                 automata",
}

%%% Huo et al., JETC volume 5, number 1 (January 2009), article 5:
%%% research article with abstract and keywords.  The braced {$2$-D} in
%%% the title protects the capital D from styles that lowercase titles.
@Article{Huo:2009:SBN,
  author =       "Dennis Huo and Qiaoyan Yu and David Wolpert and Paul
                 Ampadu",
  title =        "A simulator for ballistic nanostructures in a {$2$-D}
                 electron gas",
  journal =      j-JETC,
  volume =       "5",
  number =       "1",
  pages =        "5:1--5:??",
  month =        jan,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1482613.1482618",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Mar 17 14:23:14 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "A multipurpose simulator for ballistic nanostructures,
                 based on classical mechanics of electrons at the Fermi
                 level, has been successfully implemented. Despite the
                 simplicity of the model, the simulator successfully
                 reproduces a number of experimental results, and is
                 shown to consistently match observed current-voltage
                 characteristics and magnetoresistance phenomena. The
                 simulator results provide design guidelines for devices
                 which operate on ballistic transport principles. Using
                 the simulator, preliminary logic structures have been
                 designed based on the ballistic deflection
                 transistor.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "2DEG; Ballistic transport; nanoelectronic device;
                 transistor",
}

%%% Bahar, JETC volume 5, number 2 (July 2009), article 6: introduction
%%% to a special section; the entry has no abstract or keywords field.
%%% The braces in the title protect "Best" and the conference acronym
%%% from styles that lowercase titles.
@Article{Bahar:2009:ISS,
  author =       "R. Iris Bahar",
  title =        "Introduction to special section: {Best} of {NANOARCH
                 2008}",
  journal =      j-JETC,
  volume =       "5",
  number =       "2",
  pages =        "6:1--6:??",
  month =        jul,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1543438.1543439",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Mar 17 14:23:24 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% Mishra, Muttreja, and Jha, JETC volume 5, number 2 (July 2009),
%%% article 7: research article with a three-paragraph abstract (the
%%% paragraphs are separated by \par) and keywords.  The greater-than
%%% sign in the abstract is written in math mode because a bare ">" in
%%% text mode prints as a different glyph under LaTeX's default OT1
%%% font encoding.
@Article{Mishra:2009:LPF,
  author =       "Prateek Mishra and Anish Muttreja and Niraj K. Jha",
  title =        "Low-power {FinFET} circuit synthesis using multiple
                 supply and threshold voltages",
  journal =      j-JETC,
  volume =       "5",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jul,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1543438.1543440",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Mar 17 14:23:24 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "According to Moore's law, the number of transistors in
                 a chip doubles every 18 months. The increased
                 transistor-count leads to increased power density.
                 Thus, in modern circuits, power efficiency is a central
                 determinant of circuit efficiency. With scaling,
                 leakage power accounts for an increasingly larger
                 portion of the total power consumption in deep
                 submicron technologies ($ > $ 40\%).\par

                 FinFET technology has been proposed as a promising
                 alternative to deep submicron bulk CMOS technology,
                 because of its better scalability, short-channel
                 characteristics, and ability to suppress leakage
                 current and mitigate device-to-device variability when
                 compared to bulk CMOS. The subthreshold slope of a
                 FinFET is approximately 60mV which is close to
                 ideal.\par

                 In this article, we propose a methodology for low-power
                 FinFET based circuit synthesis. A mechanism called TCMS
                 (Threshold Control through Multiple Supply Voltages)
                 was previously proposed for improving the power
                 efficiency of FinFET based global interconnects. We
                 propose a significant generalization of TCMS to the
                 design of any logic circuit. This scheme represents a
                 significant divergence from the conventional multiple
                 supply voltage schemes considered in the past. It also
                 obviates the need for voltage level-converters. We
                 employ accurate delay and power estimates using table
                 look-up methods based on HSPICE simulations for supply
                 voltage and threshold voltage optimization.
                 Experimental results demonstrate that TCMS can provide
                 power savings of 67.6\% and device area savings of
                 65.2\% under relaxed delay constraints. Two other
                 variants of TCMS are also proposed that yield similar
                 benefits. We compare our scheme to extended cluster
                 voltage scaling (ECVS), a popular dual- {\em V$_{dd}$
                 \/} scheme presented in the literature. ECVS makes use
                 of voltage level-converters. Even when it is assumed
                 that these level-converters have zero delay, thus
                 significantly favoring ECVS in time-constrained power
                 optimization, TCMS still outperforms ECVS.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "linear programming; Low-power; synthesis; TCMS",
}

%%% Crocker, Hu, and Niemier, JETC volume 5, number 2 (July 2009),
%%% article 8: research article with abstract and keywords.  The braces
%%% around the acronyms QCA and PLAs in the title preserve their
%%% capitalization under styles that lowercase titles.
@Article{Crocker:2009:DFQ,
  author =       "Michael Crocker and X. Sharon Hu and Michael Niemier",
  title =        "Defects and faults in {QCA}-based {PLAs}",
  journal =      j-JETC,
  volume =       "5",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jul,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1543438.1543441",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Mar 17 14:23:24 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Defect tolerance will be critical in any system with
                 nanoscale feature sizes. This article examines some
                 fundamental aspects of defect tolerance for a
                 reconfigurable system based on Quantum-dot Cellular
                 Automata (QCA). We analyze a novel, QCA-based,
                 Programmable Logic Array (PLA) structure, develop an
                 implementation independent fault model, and discuss how
                 expected defects and faults might affect yield. Within
                 this context, we introduce techniques for mapping
                 Boolean logic functions to a defective QCA-based PLA.
                 Simulation results show that our new mapping techniques
                 can achieve higher yields than existing techniques.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "defects; faults; logic mapping; Nanotechnology;
                 quantum-dot cellular automata",
}

%%% JETC 5(2), July 2009, article 9 (Wu, Falkenstern, Chakrabarty, Xie):
%%% scan-chain design for 3D ICs.  j-JETC and ack-nhfb are string
%%% abbreviations expected to be defined earlier in the file.
@Article{Wu:2009:SCD,
  author =       "Xiaoxia Wu and Paul Falkenstern and Krishnendu
                 Chakrabarty and Yuan Xie",
  title =        "Scan-chain design and optimization for
                 three-dimensional integrated circuits",
  journal =      j-JETC,
  volume =       "5",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jul,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1543438.1543442",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Mar 17 14:23:24 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Scan chains are widely used to improve the testability
                 of integrated circuit (IC) designs and to facilitate
                 fault diagnosis. For traditional 2D IC design, a number
                 of design techniques have been proposed in the
                 literature for scan-chain routing and scan-cell
                 partitioning. However, these techniques are not
                 effective for three-dimensional (3D) technologies,
                 which have recently emerged as a promising means to
                 continue technology scaling. In this article, we
                 propose two techniques for designing scan chains in 3D
                 ICs, with given constraints on the number of
                 through-silicon-vias (TSVs). The first technique is
                 based on a genetic algorithm (GA), and it addresses the
                 ordering of cells in a single scan chain. The second
                 optimization technique is based on integer linear
                 programming (ILP); it addresses single-scan-chain
                 ordering as well as the partitioning of scan flip-flops
                 into multiple scan chains. We compare these two methods
                 by conducting experiments on a set of ISCAS'89
                 benchmark circuits. The first conclusion obtained from
                 the results is that 3D scan-chain optimization achieves
                 significant wire-length reduction compared to 2D
                 counterparts. The second conclusion is that the
                 ILP-based technique provides lower bounds on the
                 scan-chain interconnect length for 3D ICs, and it
                 offers considerable reduction in wire-length compared
                 to the GA-based heuristic method.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "3D ICs; genetic algorithm; integer linear programming;
                 LP relaxation; randomized rounding; scan-chain design",
}

%%% JETC 5(2), July 2009, article 10 (Datta, Joshi, Ravindran, Mukherjee):
%%% parallel test and diagnosis of digital microfluidic biochips.
%%% j-JETC and ack-nhfb are string abbreviations expected to be defined
%%% earlier in the file.
@Article{Datta:2009:EPT,
  author =       "Siddhartha Datta and Bharat Joshi and Arun Ravindran
                 and Arindam Mukherjee",
  title =        "Efficient parallel testing and diagnosis of digital
                 microfluidic biochips",
  journal =      j-JETC,
  volume =       "5",
  number =       "2",
  pages =        "10:1--10:??",
  month =        jul,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1543438.1543443",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Mar 17 14:23:24 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Microfluidics-based biochips consist of microfluidic
                 arrays on rigid substrates through which movement of
                 fluids is tightly controlled to facilitate biological
                 reactions. Biochips are soon expected to revolutionize
                 biosensing, clinical diagnostics, environmental
                 monitoring, and drug discovery. Critical to the
                 deployment of the biochips in such diverse areas is the
                 dependability of these systems. Thus robust testing and
                 diagnosis techniques are required to ensure adequate
                 level of system dependability. Due to the underlying
                 mixed technology and mixed energy domains, such
                 biochips exhibit unique failure mechanisms and defects.
                 In this article efficient parallel testing and
                 diagnosis algorithms are presented that can detect and
                 locate single as well as multiple faults in a
                 microfluidic array without flooding the array, a
                 problem that has hampered realistic implementation of
                 several existing strategies. The fault diagnosis
                 algorithms are well suited for built-in self-test that
                 could drastically reduce the operating cost of
                 microfluidic biochip. Also, the proposed algorithms can
                 be used both for testing and fault diagnosis during
                 field operation as well as increasing yield during the
                 manufacturing phase of the biochip. Furthermore, these
                 algorithms can be applied to both online and offline
                 testing and diagnosis. Analytical results suggest that
                 these strategies that can be used to design highly
                 dependable biochip systems.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "defect tolerance; droplet flooding; fault tolerance;
                 Microfluidic biochip; microfluidics; multiple faults;
                 reconfigurability; testing",
}

%%% JETC 5(2), July 2009, article 11 (Tahoori): defect tolerance for
%%% crossbar nanoarchitectures.  The blank line after \par inside the
%%% abstract is ordinary whitespace within the quoted value, not the end
%%% of the field.
@Article{Tahoori:2009:LOD,
  author =       "Mehdi B. Tahoori",
  title =        "Low-overhead defect tolerance in crossbar
                 nanoarchitectures",
  journal =      j-JETC,
  volume =       "5",
  number =       "2",
  pages =        "11:1--11:??",
  month =        jul,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1543438.1543444",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Mar 17 14:23:24 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "It is anticipated that the number of defects in
                 nanoscale devices fabricated using bottom-up
                 self-assembly process is significantly higher than that
                 for CMOS devices fabricated by conventional top-down
                 lithography patterning. This is mainly because of
                 inherent lack of control in self-assembly fabrication
                 as well as atomic scale of devices. The goal of defect
                 tolerance, as an integral part of nano computing, is to
                 obtain error-free computation from such fabrics
                 containing defective elements.\par

                 In this article, an application-independent defect
                 tolerant scheme for reconfigurable crossbar array
                 nanoarchitectures is presented. The main feature of
                 this approach is that the existence and location of
                 defective resources within the nano-fabric are hidden
                 from the entire design flow, resulting in minimum
                 post-fabrication customization per chip and minimum
                 changes to the entire design and synthesis flow. It is
                 also shown how to drastically minimize the area
                 overhead associated with this flow. The proposed
                 technique requires extraction of regular yet incomplete
                 defect-free subsets, in contrast to previously proposed
                 complete defect-free subsets. This can greatly reduce
                 the area overhead required for defect tolerance while
                 not sacrificing logic mapping or signal routing
                 capabilities. Extensive simulation results confirm
                 considerable reduction in the area overhead without any
                 negative impact on the usability of modified
                 defect-free subsets.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "Defect tolerance; nanotechnology; reconfigurable
                 architectures",
}

%%% JETC 5(3), August 2009, article 12 (Chakraborty, Bhunia): asynchronous
%%% design for CMOS-nano hybrid systems.  Braces around CMOS in the title
%%% protect its capitalization from style-driven downcasing; the abstract
%%% carries LaTeX emphasis markup ({\em ...\/}).
@Article{Chakraborty:2009:SAD,
  author =       "Rajat Subhra Chakraborty and Swarup Bhunia",
  title =        "A study of asynchronous design methodology for robust
                 {CMOS}-nano hybrid system design",
  journal =      j-JETC,
  volume =       "5",
  number =       "3",
  pages =        "12:1--12:??",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1568485.1568486",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Mar 17 14:23:41 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Among the emerging alternatives to CMOS, molecular
                 electronics based diode-resistor crossbar fabric has
                 generated considerable interest in recent times. Logic
                 circuit design with future nano-scale molecular devices
                 using dense and regular crossbar fabrics is promising
                 in terms of integration density, performance and power
                 dissipation. However, circuit design using molecular
                 switches involve some major challenges: (1) lack of
                 voltage gain of these switches that prevents logic
                 cascading; (2) large output voltage level degradation;
                 (3) vulnerability to parameter variations that affect
                 yield and robustness of operation; and (4) high defect
                 rate. In this article, we analyze some of the above
                 challenges and investigate the effectiveness of
                 asynchronous design methodology in a hybrid system
                 design platform using molecular crossbar and CMOS
                 interfacing elements. We explore different approaches
                 of asynchronous circuit design and compare their
                 suitability in terms of several circuit design
                 parameters. We then develop the methodology and an
                 automated synthesis flow to support two different
                 asynchronous design approaches ({\em Micropipelines\/}
                 and {\em Four phase Dual-rail\/}) for system designs
                 using nano-crossbar logic stages and CMOS interface
                 data-storage elements. Circuit-level simulation results
                 for several benchmarks show considerable advantage in
                 terms of performance and robustness at moderate area
                 and power overhead compared to two different
                 synchronous implementations.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "Asynchronous design; CMOS-nano co-design; dual-rail
                 circuits; logic degradation; micropipelines; nano-scale
                 crossbar; robust design",
}

%%% JETC 5(3), August 2009, article 13 (Zhang, Jha, Shang): Part II of the
%%% two-part NATURE paper.  Part I is the later entry Zhang:2009:HNCb
%%% (number 4), so the key suffixes a/b follow issue order rather than
%%% part numbers.
@Article{Zhang:2009:HNCa,
  author =       "Wei Zhang and Niraj K. Jha and Li Shang",
  title =        "A hybrid {Nano\slash CMOS} dynamically reconfigurable
                 system --- {Part II}: {Design} optimization flow",
  journal =      j-JETC,
  volume =       "5",
  number =       "3",
  pages =        "13:1--13:??",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1568485.1568487",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Mar 17 14:23:41 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "In Part I of this work, a hybrid nano/CMOS
                 reconfigurable architecture, called NATURE, was
                 described. It is composed of CMOS reconfigurable logic
                 and interconnect fabric, and nonvolatile nano on-chip
                 memory. Through its support for cycle-by-cycle runtime
                 reconfiguration and a highly-efficient computation
                 model, temporal logic folding, NATURE improves logic
                 density and area-delay product by more than an order of
                 magnitude compared to existing CMOS-based
                 field-programmable gate arrays (FPGAs). NATURE can be
                 fabricated using mainstream photo-lithography
                 fabrication techniques. Thus, it offers a currently
                 commercially feasible architecture with high
                 performance, superior logic density, and excellent
                 runtime design flexibility.\par

                 In Part II of this work, we present an integrated
                 design and optimization flow for NATURE, called
                 NanoMap. Given an input design specified in
                 register-transfer level (RTL) and/or gate-level VHDL,
                 NanoMap optimizes and implements the design on NATURE
                 through logic mapping, temporal clustering, temporal
                 placement, and routing. As opposed to other design
                 tools for traditional FPGAs, NanoMap supports and
                 leverages temporal logic folding by integrating novel
                 mapping techniques. It can automatically explore and
                 identify the best temporal logic folding configuration,
                 targeting area, delay or area-delay product
                 optimization. A force-directed scheduling technique is
                 used to optimize and balance resource usage across
                 different folding cycles. By supporting logic folding,
                 NanoMap can provide significant design flexibility in
                 performing area-delay trade-offs under various
                 user-specified constraints. We present details of the
                 mapping procedure and results for different
                 architectural instances. Experimental results
                 demonstrate that NanoMap can judiciously trade off area
                 and delay targeting different optimization goals, and
                 effectively exploit the advantages of NATURE.\par

                 Part I of this work will appear in JETC Vol. 5, No.
                 4.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "design optimization flow; Dynamic reconfiguration;
                 logic folding; NATURE",
}

%%% JETC 5(3), August 2009, article 14 (Simsir, Cadambi, Ivancic,
%%% Roetteler, Jha): hybrid nanowire-CMOS architecture for defect and
%%% fault tolerance.  The third author's surname is Ivancic with a caron
%%% on the first c and an acute on the second; each accent is written as a
%%% braced control sequence.
@Article{Simsir:2009:HNC,
  author =       "Muzaffer O. Simsir and Srihari Cadambi and Franjo
                 Ivan{\v{c}}i{\'c} and Martin Roetteler and Niraj K.
                 Jha",
  title =        "A hybrid nano-{CMOS} architecture for defect and fault
                 tolerance",
  journal =      j-JETC,
  volume =       "5",
  number =       "3",
  pages =        "14:1--14:??",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1568485.1568488",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Mar 17 14:23:41 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "As the end of the semiconductor roadmap for CMOS
                 approaches, architectures based on nanoscale molecular
                 devices are attracting attention. Among several
                 alternatives, silicon nanowires and carbon nanotubes
                 are the two most promising nanotechnologies according
                 to the ITRS. These technologies may enable scaling deep
                 into the nanometer regime. However, they suffer from
                 very defect-prone manufacturing processes. Although the
                 reconfigurability property of the nanoscale devices can
                 be used to tolerate high defect rates, it may not be
                 possible to locate all defects. With very high device
                 densities, testing each component may not be possible
                 because of time or technology restrictions. This points
                 to a scenario in which even though the devices are
                 tested, the tests are not very comprehensive at
                 locating defects, and hence the shipped chips are still
                 defective. Moreover, the devices in the nanometer range
                 will be susceptible to transient faults which can
                 produce arbitrary soft errors. Despite these drawbacks,
                 it is possible to make nanoscale architectures
                 practical and realistic by introducing defect and fault
                 tolerance. In this article, we propose and evaluate a
                 hybrid nanowire-CMOS architecture that addresses all
                 three problems --- namely high defect rates, unlocated
                 defects, and transient faults --- at the same time.
                 This goal is achieved by using multiple levels of
                 redundancy and majority voters. A key aspect of the
                 architecture is that it contains a judicious balance of
                 both nanoscale and traditional CMOS components. A
                 companion to the architecture is a compiler with
                 heuristics to quickly determine if logic can be mapped
                 onto partially defective nanoscale elements. The
                 heuristics make it possible to introduce
                 defect-awareness in placement and routing. The
                 architecture and compiler are evaluated by applying the
                 complete design flow to several benchmarks.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "Defect tolerance; nanotechnology; nanowires",
}

%%% JETC 5(3), August 2009, article 15 (Wang, Dai, Hasaneen, Wang, Jain):
%%% quantum dot transistors with programmable threshold voltage for
%%% low-power mobile computing.  j-JETC and ack-nhfb are string
%%% abbreviations expected to be defined earlier in the file.
@Article{Wang:2009:UQD,
  author =       "Shuo Wang and Jianwei Dai and El-Sayed Hasaneen and
                 Lei Wang and Faquir Jain",
  title =        "Utilizing quantum dot transistors with programmable
                 threshold voltages for low-power mobile computing",
  journal =      j-JETC,
  volume =       "5",
  number =       "3",
  pages =        "15:1--15:??",
  month =        aug,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1568485.1568489",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Mar 17 14:23:41 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Power consumption poses one of the fundamental
                 barriers for deploying mobile computing devices in
                 energy-constrained situations with varying operation
                 conditions. In particular, leakage power is projected
                 to increase exponentially in future semiconductor
                 process nodes. This challenging problem is pressing for
                 renewed focus on power-performance optimization at all
                 levels of design abstract, from novel device structures
                 to fundamental shifts in design paradigm. In this
                 article, we propose to exploit the programmable
                 threshold voltage quantum dot (QD) transistors to
                 reduce leakage thereby improving the energy efficiency
                 for mobile computing. The unique programmability and
                 reconfigurability enabled by QD transistors extend our
                 capability in design optimization for new
                 power-performance trade-offs. Simulation results
                 demonstrate the significant leakage reduction over
                 conventional techniques.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "Low power; threshold voltage and quantum dot
                 transistor",
}

%%% JETC 5(4), November 2009, article 16 (Zhang, Jha, Shang): Part I of the
%%% two-part NATURE paper; Part II is the earlier entry Zhang:2009:HNCa
%%% (number 3).  Blank lines after \par in the abstract lie inside the
%%% quoted value and do not end the field.
@Article{Zhang:2009:HNCb,
  author =       "Wei Zhang and Niraj K. Jha and Li Shang",
  title =        "A hybrid {nano\slash CMOS} dynamically reconfigurable
                 system --- {Part I}: {Architecture}",
  journal =      j-JETC,
  volume =       "5",
  number =       "4",
  pages =        "16:1--16:??",
  month =        nov,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1629091.1629092",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Mar 17 14:23:55 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Rapid progress on nanodevices points to a promising
                 direction for future circuit design. However, since
                 nanofabrication techniques are not yet mature,
                 implementation of nanocircuits, at least on a large
                 scale, in the near future is infeasible. To ease
                 fabrication and overcome the problem of high defect
                 levels in nanotechnology, hybrid nano/CMOS
                 reconfigurable architectures are attractive choices.
                 Moreover, if the current photolithography fabrication
                 process can be used to manufacture the hybrid chips,
                 the benefits of nanotechnologies can be realized
                 today.\par

                 Traditional reconfigurable architectures can only
                 support partial or coarse-grain runtime reconfiguration
                 due to their limited on-chip storage and long off-chip
                 reconfiguration latency. Recent progress on nano Random
                 Access Memories (RAMs), such as carbon nanotube-based
                 RAM (NRAM), Phase-Change Memory (PCM), magnetoresistive
                 RAM (MRAM), etc., provides us with a chance to realize
                 on-chip fine-grain runtime reconfiguration. These nano
                 RAMs have good compatibility with the current
                 fabrication process. By utilizing them in the hybrid
                 design, we can take advantage of both CMOS and
                 nanotechnology, and greatly improve the logic density,
                 resource utilization, and performance of our
                 design.\par

                 In this article, we propose a high-performance
                 reconfigurable architecture, called NATURE, that
                 utilizes CMOS logic and nano RAMs. An automatic design
                 flow for NATURE is presented in Part II of the article.
                 In NATURE, the highly dense nonvolatile nano RAMs are
                 distributed throughout the chip to allow large embedded
                 on-chip configuration storage, which enables fast
                 reading and hence supports fine-grain runtime
                 reconfiguration and temporal logic folding of a circuit
                 before being mapped to the architecture. Temporal logic
                 folding can significantly increase the logic density of
                 NATURE (by over an order of magnitude for large
                 circuits) while remaining competitive in performance
                 and power consumption. For ease of exposition, we use
                 NRAMs to illustrate various concepts in this article
                 due to the excellent properties of NRAMs. However,
                 other nano RAMs can also be used instead. Experimental
                 results based on NRAMs establish the efficacy of
                 NATURE.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "logic folding; NRAM; runtime reconfiguration",
}

%%% JETC 5(4), November 2009, article 17 (Zhang, Jha, Shang): design space
%%% exploration and data memory design for the NATURE architecture.
%%% j-JETC and ack-nhfb are string abbreviations expected to be defined
%%% earlier in the file.
@Article{Zhang:2009:DSE,
  author =       "Wei Zhang and Niraj K. Jha and Li Shang",
  title =        "Design space exploration and data memory architecture
                 design for a hybrid {nano\slash CMOS} dynamically
                 reconfigurable architecture",
  journal =      j-JETC,
  volume =       "5",
  number =       "4",
  pages =        "17:1--17:??",
  month =        nov,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1629091.1629093",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Mar 17 14:23:55 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "In recent years, research on nanotechnology has
                 advanced rapidly. Novel nanodevices have been
                 developed, such as those based on carbon nanotubes,
                 nanowires, etc. Using these emerging nanodevices,
                 diverse nanoarchitectures have been proposed. Among
                 them, hybrid nano/CMOS reconfigurable architectures
                 have attracted attention because of their advantages in
                 performance, integration density, and fault tolerance.
                 Recently, a high-performance hybrid nano/CMOS
                 reconfigurable architecture, called NATURE, was
                 presented. NATURE comprises CMOS reconfigurable logic
                 and interconnect fabric, and
                 CMOS-fabrication-compatible nanomemory. High-density,
                 fast nano RAMs are distributed in NATURE as on-chip
                 storage to store multiple reconfiguration copies for
                 each reconfigurable element. It enables cycle-by-cycle
                 runtime reconfiguration and a highly efficient
                 computational model, called temporal logic folding.
                 Through logic folding, NATURE provides more than an
                 order of magnitude improvement in logic density and
                 area-delay product, and significant design flexibility
                 in performing area-delay trade-offs, at the same
                 technology node. Moreover, NATURE can be fabricated
                 using mainstream photolithography fabrication
                 techniques. Hence, it offers a currently commercially
                 viable reconfigurable architecture with high
                 performance, superior logic density, and outstanding
                 design flexibility, which is very attractive for
                 deployment in cost-conscious embedded systems.\par

                 In order to fully explore the potential of NATURE and
                 further improve its performance, in this article, a
                 thorough design space exploration is conducted to
                 optimize its architecture. Investigations in terms of
                 different logic element architectures, interconnect
                 designs, and various technologies for nano RAMs are
                 presented. Nano RAMs can not only be used as storage
                 for configuration bits, but the high density of nano
                 RAMs also makes them excellent candidates for
                 large-capacity on-chip data storage in NATURE. Many
                 logic- and memory-intensive applications, such as video
                 and image processing, require large storage of temporal
                 results. To enhance the capability of NATURE for
                 implementing such applications, we investigate the
                 design of nano data memory structures in NATURE and
                 explore the impact of memory density. Experimental
                 results demonstrate significant throughput improvements
                 due to area saving from logic folding and parallel data
                 processing.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "logic folding; Nano data RAM; runtime
                 reconfiguration",
}

%%% JETC 5(4), November 2009, article 18 (Tang, Wang, Lombardi):
%%% defect/error-tolerant nanowire-crossbar architecture for DSP.  \slash
%%% in the title is LaTeX markup for the "/" of the printed title, and the
%%% braces around DSP protect its capitalization.
@Article{Tang:2009:DET,
  author =       "Weiguo Tang and Lei Wang and Fabrizio Lombardi",
  title =        "A defect\slash error-tolerant nanosystem architecture
                 for {DSP}",
  journal =      j-JETC,
  volume =       "5",
  number =       "4",
  pages =        "18:1--18:??",
  month =        nov,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1629091.1629094",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Mar 17 14:23:55 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Emerging technologies such as silicon NanoWires (NW)
                 and Carbon NanoTubes (CNT) have shown great potential
                 for building the next generation of computing systems
                 in the nano ranges. However, the excessive number of
                 defects originating from bottom-up fabrication (such as
                 a self-assembly process) poses a pressing challenge for
                 achieving scalable system integration. This article
                 proposes a new nanosystem architecture that employs
                 nanowire crossbars for Digital Signal Processing (DSP)
                 applications. Distributed arithmetic is utilized such
                 that complex signal processing computation can be
                 mapped into regular memory operations, thus making this
                 architecture well suited for implementation by nanowire
                 crossbars. Furthermore, the inherent features of
                 DSP-type computation provide new insights to remedy
                 errors (as logic/computational manifestation of
                 defects). A new defect/error-tolerant technique that
                 exploits algorithmic error compensation is proposed; at
                 system level different trade-offs between correctness
                 in output and performance are established while
                 retaining low overhead in its implementation. As an
                 instance of its application, the proposed approach has
                 been utilized to a generic DSP nanosystem performing
                 frequency-selective filtering. Simulation results show
                 that the proposed nanoDSP introduces only a minor
                 performance degradation under high defect rates and at
                 a range of operational conditions. The proposed
                 technique also features good scalability and viability
                 for various DSP applications.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "algorithmic error compensation; Distributed
                 arithmetic; DSP nanosystem; inner product",
}

@Article{Dysart:2009:OWR,
  author =       "Timothy J. Dysart and Peter M. Kogge",
  title =        "Organizing wires for reliability in magnetic {QCA}",
  journal =      j-JETC,
  volume =       "5",
  number =       "4",
  pages =        "19:1--19:??",
  month =        nov,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1629091.1629095",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Mar 17 14:23:55 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "This article investigates, via analytic modeling, how
                 a magnetic QCA wire should be organized to provide the
                 highest reliability. We compare a nonredundant wire and
                 two redundant wire organizations. For all three
                 organizations, a fault rate per unit length is used for
                 comparison; additionally, since extra components are
                 necessary to implement the redundant organizations,
                 these components are faulty as well. We show that the
                 difference between these two fault rates is the main
                 driver for selecting a wire organization. Lastly, we
                 develop a guideline for selecting the most reliable
                 wire organization during the circuit design process.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "modular redundancy; nanomagnet logic; QCA",
}

@Article{Chakrabarty:2010:E,
  author =       "Krishnendu Chakrabarty",
  title =        "Editorial",
  journal =      j-JETC,
  volume =       "6",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1721650.1721651",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Mar 17 14:24:05 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Lee:2010:FBP,
  author =       "Chun-Yi Lee and Niraj K. Jha",
  title =        "{FinFET}-based power simulator for interconnection
                 networks",
  journal =      j-JETC,
  volume =       "6",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1721650.1721652",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Mar 17 14:24:05 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Double-gate FETs, specifically FinFETs, are emerging
                 as promising substitutes for bulk CMOS at the 32nm
                 technology node and beyond because of the various
                 obstacles to scaling faced by CMOS, such as
                 short-channel effects, leakage power, and process
                 variations. Another trend in chip multiprocessor design
                 is incorporation of sophisticated on-chip
                 interconnection networks. However, such networks are
                 significant power-consumers. In this article, we
                 address these two trends by presenting a power
                 simulator for FinFET-based on-chip interconnection
                 networks. It estimates both dynamic and leakage power.
                 We present results for various FinFET design styles and
                 temperatures (since leakage power changes drastically
                 with temperature), and show that one FinFET design
                 style may be much superior to another from the power
                 consumption point of view.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "FinFETs; interconnection network; power consumption;
                 power simulator",
}

@Article{Liu:2010:RSO,
  author =       "Yang Liu and Chris Dwyer and Alvin R. Lebeck",
  title =        "Routing in self-organizing nano-scale irregular
                 networks",
  journal =      j-JETC,
  volume =       "6",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1721650.1721653",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Mar 17 14:24:05 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "The integration of novel nanotechnologies onto silicon
                 platforms is likely to increase fabrication defects
                 compared with traditional CMOS technologies.
                 Furthermore, the number of nodes connected with these
                 networks makes acquiring a global defect map
                 impractical. As a result, on-chip networks will provide
                 defect tolerance by self-organizing into irregular
                 topologies. In this scenario, simple static routing
                 algorithms based on regular physical topologies, such
                 as meshes, will be inadequate. Additionally, previous
                 routing approaches for irregular networks assume
                 abundant resources and do not apply to this domain of
                 resource-constrained self-organizing nano-scale
                 networks. Consequently, routing algorithms that work in
                 irregular networks with limited resources are
                 needed.\par

                 In this article, we explore routing for self-organizing
                 nano-scale irregular networks in the context of a
                 Self-Organizing SIMD Architecture (SOSA). Our approach
                 trades configuration time and a small amount of storage
                 for reduced communication latency. We augment an Euler
                 path-based routing technique for trees to generate
                 static shortest paths between certain pairs of nodes
                 while remaining deadlock free. Simulations of several
                 applications executing on SOSA show our proposed
                 routing algorithm can reduce execution time by 8\% to
                 30\%.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "data parallel; DNA; nanocomputing; Self-organizing;
                 SIMD",
}

@Article{Kocak:2010:IDT,
  author =       "Taskin Kocak and Dhiraj Pradhan",
  title =        "Introduction to design techniques for energy
                 harvesting",
  journal =      j-JETC,
  volume =       "6",
  number =       "2",
  pages =        "4:1--4:??",
  month =        jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1773814.1773815",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Sep 7 08:33:18 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Wenck:2010:SST,
  author =       "Justin Wenck and Jamie Collier and Jeff Siebert and
                 Rajeevan Amirtharajah",
  title =        "Scaling self-timed systems powered by mechanical
                 vibration energy harvesting",
  journal =      j-JETC,
  volume =       "6",
  number =       "2",
  pages =        "5:1--5:??",
  month =        jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1773814.1773816",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Sep 7 08:33:18 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Passive energy harvesting from mechanical vibration
                 has wide application in wearable devices and wireless
                 sensors to complement or replace batteries. Energy
                 harvesting efficiency can be increased by eliminating
                 AC/DC conversion. A test chip demonstrating
                 self-timing, power-on reset circuitry, and dynamic
                 memory for energy harvesting AC voltages has been
                 designed in 180 nm CMOS and tested. An energy scalable
                 DSP architecture implements FIR filters that consume as
                 little as 170 pJ per output sample. The on-chip DRAM
                 retains data for up to 28 ms while register data is
                 retained down to a supply voltage of 153 mV. Circuit
                 operation is confirmed for supply frequencies between
                 60 Hz and 1 kHz with power consumption below 130 $ \mu
                 $W. Reaching the limits of miniaturization will require
                 approaching the limits of power dissipation. We
                 extrapolate from this DSP architecture to find the
                 minimum volume required for mechanical vibration energy
                 harvesting sensors.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "AC power supply; DRAM; energy harvesting; energy-aware
                 systems; integrated circuits; low-power design;
                 power-on reset; scaling; self-timed",
}

@Article{Wang:2010:DCS,
  author =       "W. S. Wang and T. O'Donnell and N. Wang and M. Hayes
                 and B. O'Flynn and C. O'Mathuna",
  title =        "Design considerations of sub-{mW} indoor light energy
                 harvesting for wireless sensor systems",
  journal =      j-JETC,
  volume =       "6",
  number =       "2",
  pages =        "6:1--6:??",
  month =        jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1773814.1773817",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Sep 7 08:33:18 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "For most wireless sensor networks, one common and
                 major bottleneck is the limited battery lifetime. The
                 frequent maintenance efforts associated with battery
                 replacement significantly increase the system
                 operational and logistics cost. Unnoticed power
                 failures on nodes will degrade the system reliability
                 and may lead to system failure. In building management
                 applications, to solve this problem, small energy
                 sources such as indoor light energy are promising to
                 provide long-term power to these distributed wireless
                 sensor nodes. This article provides comprehensive
                 design considerations for an indoor light energy
                 harvesting system for building management applications.
                 Photovoltaic cells characteristics, energy storage
                 units, power management circuit design, and power
                 consumption pattern of the target mote are presented.
                 Maximum power point tracking circuits are proposed
                 which significantly increase the power obtained from
                 the solar cells. The novel fast charge circuit reduces
                 the charging time. A prototype was then successfully
                 built and tested in various indoor light conditions to
                 discover the practical issues of the design. The
                 evaluation results show that the proposed prototype
                 increases the power harvested from the PV cells by 30\%
                 and also accelerates the charging rate by 34\% in a
                 typical indoor lighting condition. By entirely
                 eliminating the rechargeable battery as energy storage,
                 the proposed system would expect an operational
                 lifetime 10--20 years instead of the current less than
                 6 months battery lifetime.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "Design consideration; energy harvesting; indoor light
                 illuminance; maximum power point tracking; PV cells;
                 supercapacitor; wireless sensor node",
}

@Article{Moser:2010:EMF,
  author =       "Clemens Moser and Jian-Jia Chen and Lothar Thiele",
  title =        "An energy management framework for energy harvesting
                 embedded systems",
  journal =      j-JETC,
  volume =       "6",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1773814.1773818",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Sep 7 08:33:18 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Energy harvesting (also known as energy scavenging) is
                 the process of generating electrical energy from
                 environmental energy sources. There exists a variety of
                 different energy sources such as solar energy, kinetic
                 energy, or thermal energy. In recent years, this term
                 has been frequently applied in the context of small
                 autonomous devices such as wireless sensor nodes. In
                 this article, a framework for energy management in
                 energy harvesting embedded systems is presented. As a
                 possible scenario, we focus on wireless sensor nodes
                 that are powered by solar cells. We demonstrate that
                 classical power management solutions have to be
                 reconceived and/or new problems arise if perpetual
                 operation of the system is required. In particular, we
                 provide a set of algorithms and methods for various
                 application scenarios, including real-time scheduling,
                 application rate control, as well as reward
                 maximization. The goal is to optimize the performance
                 of the application subject to given energy constraints.
                 Our methods optimize the system performance which, for
                 example, allows the usage of smaller solar cells and
                 smaller batteries. Furthermore, we show how to
                 dimension important system parameters like the minimum
                 battery capacity or a sufficient prediction horizon.
                 Our theoretical results are supported by simulations
                 using long-term measurements of solar energy in an
                 outdoor environment. In contrast to previous works, we
                 present a formal framework which is able to capture the
                 performance, the parameters, and the energy model of
                 various energy harvesting systems. We combine different
                 viewpoints, include corresponding simulation results,
                 and provide a thorough discussion of implementation
                 aspects.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "embedded systems; energy harvesting; model predictive
                 control; Power management; real-time scheduling; reward
                 maximization",
}

@Article{Mohanty:2010:UDS,
  author =       "Saraju P. Mohanty and Dhiraj K. Pradhan",
  title =        "{ULS}: a dual-{$ V_{th} $} \slash high-$ \kappa $
                 nano-{CMOS} universal level shifter for system-level
                 power management",
  journal =      j-JETC,
  volume =       "6",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1773814.1773819",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Sep 7 08:33:18 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Power dissipation is a major bottleneck for emerging
                 applications, such as implantable systems, digital
                 cameras, and multimedia processors. Each of these
                 applications is essentially designed as an
                 Analog/Mixed-Signal System-on-a-Chip (AMS-SoC). These
                 AMS-SoCs are typically operated from a single
                 power-supply source which is a battery providing a
                 constant supply voltage. In order to reduce power
                 dissipation of the AMS-SoCs, multiple-supply voltage
                 and/or variable-supply voltage is used as an attractive
                 low-power design approach. In the
                 multiple-/variable-supply voltage AMS-SoCs the use of a
                 DC-to-DC voltage-level shifter is critical. The
                 voltage-level shifter is an overhead when its own power
                 dissipation is high. In this article a new DC-to-DC
                 voltage-level shifter is introduced that performs
                 level-up shifting, level-down shifting, and blocking of
                 voltages and is called Universal Level Shifter (ULS).
                 The ULS is a unique component that reduces dynamic
                 power and leakage of the AMS-SoCs while facilitating
                 their reconfigurability. The system-level architectures
                 for three AMS-SoCs, such as Drug Delivery
                 Nano-Electro-Mechanical-System (DDNEMS), Secure Digital
                 Camera (SDC), and Net-centric Multimedia Processor
                 (NMP) are introduced to demonstrate the use of the ULS for
                 system-level power management. The article presents a
                 design flow and an algorithm for optimal design of the
                 ULS using a dual-$ V_{th} $ high-$ \kappa $ technique
                 for efficient realization of ULS. A prototype ULS is
                 presented for 32nm nano-CMOS technology node. The
                 robustness of the ULS design is examined by performing
                 three types of analysis, such as parametric, load, and
                 power. It is observed that the ULS produces a stable
                 output for voltages as low as 0.35 V and loads varying
                 from 50 {\em fF\/} to 120 {\em fF}. The average power
                 dissipation of the ULS with a 82 {\em fF\/} capacitive
                 load is 5 $ \mu ${\em W}.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "Analog/Mixed-Signal System-on-a-Chip (AMS-SoC);
                 DC-to-DC voltage-level shifter; dual-threshold voltage;
                 high-$ \kappa $/metal-gate nano-CMOS; low-power design;
                 nanoscale CMOS; Power management; system-level power
                 management",
}

@Article{Dai:2010:ITA,
  author =       "Jianwei Dai and Lei Wang and Fabrizio Lombardi",
  title =        "An information-theoretic analysis of quantum-dot
                 cellular automata for defect tolerance",
  journal =      j-JETC,
  volume =       "6",
  number =       "3",
  pages =        "9:1--9:??",
  month =        aug,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1777401.1777402",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Sep 7 08:33:31 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Quantum-dot cellular automata (QCA) has been advocated
                 as a promising emerging nanotechnology for designing
                 future nanocomputing systems. However, at device level,
                 the large number of expected defects represents a
                 significant hurdle for reliable computation in
                 QCA-based systems. In this paper, we present an
                 information-theoretic approach to investigate the
                 relationship between defect tolerance and redundancy in
                 QCA devices. By modeling defect-prone QCA devices as
                 unreliable information processing media, we determine
                 the information transfer capacity, as bound on the
                 reliability that QCA devices can achieve. The proposed
                 method allows to evaluate the effectiveness of
                 redundancy-based defect tolerance in an effective and
                 quantitative manner.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "defect tolerance; information theoretic analysis; QCA;
                 reliability",
}

@Article{Zhang:2010:LPN,
  author =       "Wei Zhang and Niraj K. Jha and Li Shang",
  title =        "Low-power {$3$D} nano\slash {CMOS} hybrid dynamically
                 reconfigurable architecture",
  journal =      j-JETC,
  volume =       "6",
  number =       "3",
  pages =        "10:1--10:??",
  month =        aug,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1777401.1777403",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Sep 7 08:33:31 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "In order to continue technology scaling beyond CMOS,
                 diverse nanoarchitectures have been proposed in recent
                 years based on emerging nanodevices, such as nanotubes,
                 nanowires, etc. Among them, some hybrid nano/CMOS
                 reconfigurable architectures enjoy the advantage that
                 they can be fabricated using photolithography. NATURE
                 is one such architecture that we have proposed
                 recently. It comprises CMOS reconfigurable logic and
                 CMOS fabrication-compatible nano RAMs. It uses
                 distributed high-density and fast nano RAMs as on-chip
                 storage for storing multiple reconfiguration copies,
                 enabling fine-grain cycle-by-cycle reconfiguration. It
                 supports a highly efficient computational model, called
                 temporal logic folding, which makes possible more than
                 an order of magnitude improvement in logic density and
                 area-delay product, significant power reduction, and
                 significant design flexibility in performing area-delay
                 trade-offs.\par

                 In this article, we extend NATURE in various
                 dimensions, evaluating various FPGA approaches in the
                 context of today's emerging technologies. First, we
                 explore the introduction of embedded coarse-grain
                 modules in the fine-grain NATURE architecture and
                 present a unified dynamically reconfigurable
                 architecture, which can significantly enhance NATURE's
                 computation power for data-dominated applications.
                 Second, we explore a 3D architecture for NATURE in
                 which the nano RAM for reconfiguration storage is on
                 one layer and the rest of the CMOS logic on another
                 layer. This leads to further improvements in logic
                 density and performance. Finally, we explore the
                 possibility of using FinFETs, an emerging double-gate
                 CMOS technology, to implement NATURE. Since power
                 consumption is an important consideration in the deep
                 nanometer regime, especially for FPGAs, we present a
                 back-gate biasing methodology for flexible threshold
                 voltage adjustment in FinFETs to significantly reduce
                 NATURE's power consumption. Simulation results
                 demonstrate the efficacy of the proposed methods.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "3D design; Coarse-grain; FinFET; runtime
                 reconfiguration",
}

@Article{Zhao:2010:ICP,
  author =       "Yang Zhao and Tao Xu and Krishnendu Chakrabarty",
  title =        "Integrated control-path design and error recovery in
                 the synthesis of digital microfluidic lab-on-chip",
  journal =      j-JETC,
  volume =       "6",
  number =       "3",
  pages =        "11:1--11:??",
  month =        aug,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1777401.1777404",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Sep 7 08:33:31 MDT 2010",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Recent advances in digital microfluidics have led to
                 tremendous interest in miniaturized lab-on-chip devices
                 for biochemical analysis. Synthesis tools have also
                 emerged for the automated design of lab-on-chip from
                 the specifications of laboratory protocols. However,
                 none of these tools consider control flow or address
                 the problem of recovering from fluidic errors that can
                 occur during on-chip bioassay execution. We present a
                 synthesis method that incorporates control paths and an
                 error-recovery mechanism in the design of a digital
                 microfluidic lab-on-chip. Based on error-propagation
                 estimates, we determine the best locations for fluidic
                 checkpoints during biochip synthesis. A microcontroller
                 coordinates the implementation of the
                 control-flow-based bioassay by intercepting the
                 synthesis results that are mapped to the software
                 programs. Real-life bioassay applications are used as
                 case studies to evaluate the proposed design method.
                 For a representative protein assay, compared to a
                 baseline chip design, the biochip with a control path
                 can reduce the completion time by 30\% when errors
                 occur during the implementation of the bioassay.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "biochips; Error recovery; microfluidics; synthesis",
}

@Article{Bhoj:2010:GDF,
  author =       "Ajay N. Bhoj and Niraj K. Jha",
  title =        "Gated-diode {FinFET DRAMs}: Device and circuit
                 design-considerations",
  journal =      j-JETC,
  volume =       "6",
  number =       "4",
  pages =        "12:1--12:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1877745.1877746",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Mon Mar 28 12:17:02 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Saeedi:2010:RCS,
  author =       "Mehdi Saeedi and Morteza Saheb Zamani and Mehdi
                 Sedighi and Zahra Sasanian",
  title =        "Reversible circuit synthesis using a cycle-based
                 approach",
  journal =      j-JETC,
  volume =       "6",
  number =       "4",
  pages =        "13:1--13:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1877745.1877747",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Mon Mar 28 12:17:02 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Thapliyal:2010:DRS,
  author =       "Himanshu Thapliyal and Nagarajan Ranganathan",
  title =        "Design of reversible sequential circuits optimizing
                 quantum cost, delay, and garbage outputs",
  journal =      j-JETC,
  volume =       "6",
  number =       "4",
  pages =        "14:1--14:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1877745.1877748",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Mon Mar 28 12:17:02 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Singh:2010:CPD,
  author =       "Montek Singh and Steven M. Nowick",
  title =        "Call for Papers: Deadline: {March 15, 2011}",
  journal =      j-JETC,
  volume =       "6",
  number =       "4",
  pages =        "15:1--15:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1877745.1877749",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Mon Mar 28 12:17:02 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% JETC 7(1), January 2011, article 1 (special-issue introduction).
@Article{Das:2011:ISI,
  author =       {Shamik Das and Garrett S. Rose},
  title =        {Introduction to Special Issue: Highlights of
                 {NANOARCH'09}},
  journal =      j-JETC,
  volume =       {7},
  number =       {1},
  pages =        {1:1--1:??},
  month =        jan,
  year =         {2011},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1899390.1899391},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Mon Mar 28 12:17:03 MDT 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  articleno =    {1},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC 7(1), January 2011, article 2.
@Article{Dingler:2011:PEI,
  author =       {Aaron Dingler and Michael T. Niemier and Xiaobo Sharon
                 Hu and Evan Lent},
  title =        {Performance and Energy Impact of Locally Controlled
                 {NML} Circuits},
  journal =      j-JETC,
  volume =       {7},
  number =       {1},
  pages =        {2:1--2:??},
  month =        jan,
  year =         {2011},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1899390.1899392},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Mon Mar 28 12:17:03 MDT 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  articleno =    {2},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC 7(1), January 2011, article 3.
@Article{Gaillardon:2011:MNB,
  author =       {P.-E. Gaillardon and F. Clermidy and I. O'Connor and
                 J. Liu and M. Amadou and G. Nicolescu},
  title =        {Matrix Nanodevice-Based Logic Architectures and
                 Associated Functional Mapping Method},
  journal =      j-JETC,
  volume =       {7},
  number =       {1},
  pages =        {3:1--3:??},
  month =        jan,
  year =         {2011},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1899390.1899393},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Mon Mar 28 12:17:03 MDT 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  articleno =    {3},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC 7(1), January 2011, article 4.
@Article{Haron:2011:RRN,
  author =       {Nor Zaidi Haron and Said Hamdioui},
  title =        {Redundant Residue Number System Code for
                 Fault-Tolerant Hybrid Memories},
  journal =      j-JETC,
  volume =       {7},
  number =       {1},
  pages =        {4:1--4:??},
  month =        jan,
  year =         {2011},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1899390.1899394},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Mon Mar 28 12:17:03 MDT 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  articleno =    {4},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC 7(2), June 2011, article 5.
@Article{Shang:2011:INC,
  author =       {Li Shang and Qianfan Xu},
  title =        {Introduction to nanophotonic communication technology
                 integration},
  journal =      j-JETC,
  volume =       {7},
  number =       {2},
  pages =        {5:1--5:??},
  month =        jun,
  year =         {2011},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1970406.1970407},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Thu Aug 18 12:25:12 MDT 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  articleno =    {5},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC 7(2), June 2011, article 6.
@Article{Beausoleil:2011:LSI,
  author =       {Raymond G. Beausoleil},
  title =        {Large-scale integrated photonics for high-performance
                 interconnects},
  journal =      j-JETC,
  volume =       {7},
  number =       {2},
  pages =        {6:1--6:??},
  month =        jun,
  year =         {2011},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1970406.1970408},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Thu Aug 18 12:25:12 MDT 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  articleno =    {6},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC 7(2), June 2011, article 7.
@Article{Biberman:2011:PNC,
  author =       {Aleksandr Biberman and Kyle Preston and Gilbert Hendry
                 and Nicol{\'a}s Sherwood-Droz and Johnnie Chan and
                 Jacob S. Levy and Michal Lipson and Keren Bergman},
  title =        {Photonic network-on-chip architectures using
                 multilayer deposited silicon materials for
                 high-performance chip multiprocessors},
  journal =      j-JETC,
  volume =       {7},
  number =       {2},
  pages =        {7:1--7:??},
  month =        jun,
  year =         {2011},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1970406.1970409},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Thu Aug 18 12:25:12 MDT 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  articleno =    {7},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC 7(2), June 2011, article 8.
@Article{Li:2011:IHN,
  author =       {Zheng Li and Moustafa Mohamed and Xi Chen and Hongyu
                 Zhou and Alan Mickelson and Li Shang and Manish
                 Vachharajani},
  title =        {{Iris}: a hybrid nanophotonic network design for
                 high-performance and low-power on-chip communication},
  journal =      j-JETC,
  volume =       {7},
  number =       {2},
  pages =        {8:1--8:??},
  month =        jun,
  year =         {2011},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1970406.1970410},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Thu Aug 18 12:25:12 MDT 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  articleno =    {8},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC 7(2), June 2011, article 9.
@Article{Cianchetti:2011:LLH,
  author =       {Mark J. Cianchetti and David H. Albonesi},
  title =        {A low-latency, high-throughput on-chip optical router
                 architecture for future chip multiprocessors},
  journal =      j-JETC,
  volume =       {7},
  number =       {2},
  pages =        {9:1--9:??},
  month =        jun,
  year =         {2011},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/1970406.1970411},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Thu Aug 18 12:25:12 MDT 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  articleno =    {9},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC 7(3), August 2011, article 10.
@Article{Zhang:2011:FBP,
  author =       {Meng Zhang and Niraj K. Jha},
  title =        {{FinFET}-Based Power Management for Improved {DPA}
                 Resistance with Low Overhead},
  journal =      j-JETC,
  volume =       {7},
  number =       {3},
  pages =        {10:1--10:??},
  month =        aug,
  year =         {2011},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2000502.2000503},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Thu Aug 18 12:25:13 MDT 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract =     {Differential power analysis (DPA) is a side-channel
                 attack that statistically analyzes the power
                 consumption of a cryptographic system to obtain secret
                 information. This type of attack is well known as a
                 major threat to information security. Effective
                 solutions with low energy and area cost for improved
                 DPA resistance are urgently needed, especially for
                 energy-constrained modern devices that are often in the
                 physical proximity of attackers. This article presents
                 a novel countermeasure against DPA attacks on smart
                 cards and other digital ICs based on FinFETs, an
                 emerging substitute for bulk CMOS at the 22nm
                 technology node and beyond. We exploit the adaptive
                 power management characteristic of FinFETs to generate
                 a high level of noise at critical moments in the
                 execution of a cryptosystem to thwart DPA attacks.},
  acknowledgement = ack-nhfb,
  articleno =    {10},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC 7(3), August 2011, article 11.
@Article{Choi:2011:EQI,
  author =       {Byung-Soo Choi and Rodney {Van Meter}},
  title =        {On the Effect of Quantum Interaction Distance on
                 Quantum Addition Circuits},
  journal =      j-JETC,
  volume =       {7},
  number =       {3},
  pages =        {11:1--11:17},
  month =        aug,
  year =         {2011},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2000502.2000504},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Thu Aug 18 12:25:13 MDT 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract =     {We investigate the theoretical limits of the effect of
                 the quantum interaction distance on the speed of exact
                 quantum addition circuits. For this study, we exploit
                 graph embedding for quantum circuit analysis. We study
                 a logical mapping of qubits and gates of any $ \Omega
                 (\log n)$-depth quantum adder circuit for two $n$-qubit
                 registers onto a practical architecture, which limits
                 interaction distance to the nearest neighbors only and
                 supports only one- and two-qubit logical gates.
                 Unfortunately, on the chosen $k$-dimensional practical
                 architecture, we prove that the depth lower bound of
                 any exact quantum addition circuits is no longer $
                 \Omega (\log n)$, but $ \Omega (\root k \of n)$.},
  acknowledgement = ack-nhfb,
  articleno =    {11},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC 7(3), August 2011, article 12.
%%% The complexity bounds in the abstract are written as TeX math.
@Article{Goren:2011:DAN,
  author =       "Sezer G{\"o}ren and H. Fatih Ugurdag and Okan Palaz",
  title =        "Defect-Aware Nanocrossbar Logic Mapping through Matrix
                 Canonization Using Two-Dimensional Radix Sort",
  journal =      j-JETC,
  volume =       "7",
  number =       "3",
  pages =        "12:1--12:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2000502.2000505",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Thu Aug 18 12:25:13 MDT 2011",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Nanocrossbars (i.e., nanowire crossbars) offer extreme
                 logic densities but come with very high defect rates;
                 stuck-open/closed, broken nanowires. Achieving
                 reasonable yield and utilization requires logic mapping
                 that is defect-aware even at the crosspoint level. Such
                 logic mapping works with a defect map per each
                 manufactured chip. The problem can be expressed as
                 matching of two bipartite graphs; one for the logic to
                 be implemented and other for the nanocrossbar. This
                 article shows that the problem becomes a Bipartite
                 SubGraph Isomorphism (BSGI) problem within
                 sub-nanocrossbars free of stuck-closed faults. Our
                 heuristic KNS-2DS is an iterative rough canonizer with
                 approximately $O(N^2)$ complexity followed by an
                 $O(N^3)$ matching algorithm.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% JETC 7(3), August 2011, article 13.
@Article{Devadoss:2011:PQT,
  author =       {Rajeswari Devadoss and Kolin Paul and M.
                 Balakrishnan},
  title =        {{p-QCA}: a Tiled Programmable Fabric Architecture
                 Using Molecular Quantum-Dot Cellular Automata},
  journal =      j-JETC,
  volume =       {7},
  number =       {3},
  pages =        {13:1--13:??},
  month =        aug,
  year =         {2011},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2000502.2000506},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Thu Aug 18 12:25:13 MDT 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract =     {Quantum-dot cellular automata is an interesting
                 computation fabric with many never-seen-before
                 properties. However, no programmable fabric scheme has
                 utilized all these properties effectively. We propose
                 an architecture for a programmable device using
                 molecular QCA which exploits all the specialities of
                 the fabric. The architecture taps the flexibility
                 provided by the clocking system of molecular QCA to
                 build a simple tile-based programmable device with the
                 3-input Majority gate as the fundamental logic
                 element.},
  acknowledgement = ack-nhfb,
  articleno =    {13},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC 7(4), December 2011, article 14 (special-issue introduction).
@Article{Singh:2011:ISI,
  author =       {Montek Singh and Steven M. Nowick},
  title =        {Introduction to Special Issue: Asynchrony in System
                 Design},
  journal =      j-JETC,
  volume =       {7},
  number =       {4},
  pages =        {14:1--14:??},
  month =        dec,
  year =         {2011},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2043643.2043644},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Thu Dec 15 09:46:08 MST 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  articleno =    {14},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC 7(4), December 2011, article 15.
@Article{Vacca:2011:ASN,
  author =       {Marco Vacca and Mariagrazia Graziano and Maurizio
                 Zamboni},
  title =        {Asynchronous Solutions for Nanomagnetic Logic
                 Circuits},
  journal =      j-JETC,
  volume =       {7},
  number =       {4},
  pages =        {15:1--15:??},
  month =        dec,
  year =         {2011},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2043643.2043645},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Thu Dec 15 09:46:08 MST 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract =     {In the years to come new solutions will be required to
                 overcome the limitations of scaled CMOS technology. One
                 approach is to adopt Nano-Magnetic Logic Circuits,
                 highly appealing for their extremely reduced power
                 consumption. Despite the interesting nature of this
                 approach, many problems arise when this technology is
                 considered for real designs. The wire is the most
                 critical of these problems from the circuit
                 implementation point of view. It works as a pipelined
                 interconnection, and its delay in terms of clock cycles
                 depends on its length. Serious complications arise at
                 the design phase, both in terms of synthesis and of
                 physical design.},
  acknowledgement = ack-nhfb,
  articleno =    {15},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC 7(4), December 2011, article 16.
@Article{Zhang:2011:NPD,
  author =       {Xuefu Zhang and Delong Shang and Fei Xia and Alex
                 Yakovlev},
  title =        {A Novel Power Delivery Method for Asynchronous Loads
                 in Energy Harvesting Systems},
  journal =      j-JETC,
  volume =       {7},
  number =       {4},
  pages =        {16:1--16:??},
  month =        dec,
  year =         {2011},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2043643.2043646},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Thu Dec 15 09:46:08 MST 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract =     {For systems depending on power harvesting, a
                 fundamental contradiction in the power delivery chain
                 has existed between conventional synchronous
                 computational loads requiring relatively stable Vdd and
                 power harvesters unable to supply it. DC/DC conversion
                 has therefore been an integral part of such systems to
                 resolve this contradiction. On the other hand,
                 asynchronous computational loads, in addition to their
                 potential power-saving capabilities, can be made
                 tolerant to a much wider range of Vdd variance. This
                 may open up opportunities for much more energy
                 efficient methods of power delivery. This article
                 presents in-depth investigations into the behavior and
                 performance of different on-chip power delivery methods
                 driving both asynchronous and synchronous loads
                 directly from a harvester source.},
  acknowledgement = ack-nhfb,
  articleno =    {16},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC 7(4), December 2011, article 17.
@Article{Plana:2011:SDI,
  author =       {Luis A. Plana and David Clark and Simon Davidson and
                 Steve Furber and Jim Garside and Eustace Painkras and
                 Jeffrey Pepper and Steve Temple and John Bainbridge},
  title =        {{SpiNNaker}: Design and Implementation of a {GALS}
                 Multicore {System-on-Chip}},
  journal =      j-JETC,
  volume =       {7},
  number =       {4},
  pages =        {17:1--17:??},
  month =        dec,
  year =         {2011},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2043643.2043647},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Thu Dec 15 09:46:08 MST 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract =     {The design and implementation of globally asynchronous
                 locally synchronous systems-on-chip is a challenging
                 activity. The large size and complexity of the systems
                 require the use of computer-aided design (CAD) tools
                 but, unfortunately, most tools do not work adequately
                 with asynchronous circuits. This article describes the
                 successful design and implementation of SpiNNaker, a
                 GALS multicore system-on-chip. The process was
                 completed using commercial CAD tools from synthesis to
                 layout. A hierarchical methodology was devised to deal
                 with the asynchronous sections of the system,
                 encapsulating and validating timing assumptions at each
                 level. The crossbar topology combined with a pipelined
                 asynchronous fabric implementation allows the on-chip
                 network to meet the stringent requirements of the
                 system.},
  acknowledgement = ack-nhfb,
  articleno =    {17},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC 7(4), December 2011, article 18.
@Article{Galceran-Oms:2011:MTU,
  author =       {Marc Galceran-Oms and Alexander Gotmanov and Jordi
                 Cortadella and Mike Kishinevsky},
  title =        {Microarchitectural Transformations Using Elasticity},
  journal =      j-JETC,
  volume =       {7},
  number =       {4},
  pages =        {18:1--18:??},
  month =        dec,
  year =         {2011},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2043643.2043648},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Thu Dec 15 09:46:08 MST 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract =     {Elasticity is a paradigm that tolerates the variations
                 in computation and communication delays. By applying
                 elastic transformations that allow varying the original
                 timing, circuits can be optimized beyond the
                 conventional rigid transformations that do not modify
                 the external timing. Pipelining is one of the classical
                 techniques to improve the throughput of a circuit. This
                 article reveals how elasticity can be effectively and
                 practically used to derive pipelined circuits by using
                 correct-by-construction transformations that can be
                 fully automated. Two designs, one of them industrial,
                 are used to demonstrate how the area-performance
                 trade-off can be explored using elasticity.},
  acknowledgement = ack-nhfb,
  articleno =    {18},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC 7(4), December 2011, article 19.
@Article{Sheikh:2011:EEP,
  author =       {Basit Riaz Sheikh and Rajit Manohar},
  title =        {{Energy-Efficient} Pipeline Templates for
                 {High-Performance} Asynchronous Circuits},
  journal =      j-JETC,
  volume =       {7},
  number =       {4},
  pages =        {19:1--19:??},
  month =        dec,
  year =         {2011},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2043643.2043649},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Thu Dec 15 09:46:08 MST 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract =     {We present two novel energy-efficient pipeline
                 templates for high throughput asynchronous circuits.
                 The proposed templates, called N-P and N-Inverter
                 pipelines, use a single-track handshake protocol. There
                 are multiple stages of logic within each pipeline. The
                 proposed techniques minimize handshake overheads
                 associated with input tokens and intermediate logic
                 nodes within a pipeline template. Each template can
                 pack a significant amount of logic in a single stage,
                 while still maintaining a fast cycle time of only 18
                 transitions. Noise and timing robustness constraints of
                 our pipelined circuits are quantified across all
                 process corners. We present completion detection scheme
                 based on wide NOR gates, which results in significant
                 latency and energy savings especially as the number of
                 outputs increase.},
  acknowledgement = ack-nhfb,
  articleno =    {19},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC 7(4), December 2011, article 20.
@Article{Matherat:2011:RCC,
  author =       {Philippe Matherat and Marc-Thierry Jaekel},
  title =        {Relativistic Causality and Clockless Circuits},
  journal =      j-JETC,
  volume =       {7},
  number =       {4},
  pages =        {20:1--20:??},
  month =        dec,
  year =         {2011},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2043643.2043650},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Thu Dec 15 09:46:08 MST 2011},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract =     {Time plays a crucial role in the performance of
                 computing systems. The accurate modelling of logical
                 devices, and of their physical implementations,
                 requires an appropriate representation of time and of
                 all properties that depend on this notion. The need for
                 a proper model, particularly acute in the design of
                 clockless delay-insensitive (DI) circuits, leads one to
                 reconsider the classical descriptions of time and of
                 the resulting order and causal relations satisfied by
                 logical operations. This questioning meets the
                 criticisms of classical spacetime formulated by
                 Einstein when founding relativity theory and is
                 answered by relativistic conceptions of time and
                 causality. Applying this approach to clockless circuits
                 and considering the trace formalism, we rewrite
                 Udding's rules, which characterize communications
                 between DI components. We exhibit their intrinsic
                 relation with relativistic causality. For that purpose,
                 we introduce relativistic generalizations of traces,
                 called R-traces, which provide a pertinent description
                 of communications and compositions of DI components.},
  acknowledgement = ack-nhfb,
  articleno =    {20},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC 8(1), February 2012, article 1.
@Article{Crocker:2012:RPA,
  author =       {Michael Crocker and Michael Niemier and X. Sharon Hu},
  title =        {A Reconfigurable {PLA} Architecture for Nanomagnet
                 Logic},
  journal =      j-JETC,
  volume =       {8},
  number =       {1},
  pages =        {1:1--1:??},
  month =        feb,
  year =         {2012},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2093145.2093146},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Tue Feb 28 16:37:42 MST 2012},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract =     {In order to continue the performance and scaling
                 trends that we have come to expect from Moore's Law,
                 many emergent computational models, devices, and
                 technologies are actively being studied to either
                 replace or augment CMOS technology. Nanomagnet Logic
                 (NML) is one such alternative. NML operates at room
                 temperature, it has the potential for low power
                 consumption, and it is CMOS compatible. In this
                 article, we present an NML programmable logic array
                 (PLA) based on a previously proposed reprogrammable
                 quantum-dot cellular automata PLA design. We also
                 discuss the fabrication and simulation validation of
                 the circuit structures unique to the NML PLA, present
                 area, energy, and delay estimates for the NML PLA,
                 compare the area of NML PLAs to other reprogrammable
                 nanotechnologies, and analyze how architectural-level
                 redundancy will affect performance and defect tolerance
                 in NML PLAs.},
  acknowledgement = ack-nhfb,
  articleno =    {1},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC 8(1), February 2012, article 2.
@Article{Henry:2012:TNH,
  author =       {Michael B. Henry and Leyla Nazhandali},
  title =        {From Transistors to {NEMS}: Highly Efficient
                 Power-Gating of {CMOS} Circuits},
  journal =      j-JETC,
  volume =       {8},
  number =       {1},
  pages =        {2:1--2:??},
  month =        feb,
  year =         {2012},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2093145.2093147},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Tue Feb 28 16:37:42 MST 2012},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract =     {A rapidly growing class of battery constrained
                 electronic applications are those with very long sleep
                 periods, such as structural health monitoring systems,
                 biomedical implants, and wireless border security
                 cameras. The traditional method for sleep-mode power
                 reduction, transistor power gating, has drawbacks,
                 including performance loss and residual leakage. This
                 article presents a thorough evaluation of a new
                 nanotechnology-enabled power gating structure,
                 CMOS-compatible NEMS switches, in the presence of
                 aggressive supply voltage scaling. Due to the infinite
                 off-resistance of the NEMS switches, the average power
                 consumption of an FFT processor performing 1 FFT per
                 hour drops by around 30 times compared to a
                 transistor-based power gating implementation.},
  acknowledgement = ack-nhfb,
  articleno =    {2},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC volume 8, number 1 (February 2012), article 3.
@article{Tolbert:2012:MDA,
  author          = {Jeremy R. Tolbert and Pratik Kabali and Simeranjit
                     Brar and Saibal Mukhopadhyay},
  title           = {Modeling and Designing for Accuracy and Energy
                     Efficiency in Wireless Electroencephalography Systems},
  journal         = j-JETC,
  volume          = {8},
  number          = {1},
  pages           = {3:1--3:??},
  month           = feb,
  year            = {2012},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2093145.2093148},
  issn            = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l          = {1550-4832},
  bibdate         = {Tue Feb 28 16:37:42 MST 2012},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Remote wireless monitoring of physiological signals
                     has emerged as a key enabler for biotelemetry and can
                     significantly improve the delivery of healthcare.
                     Improving the energy efficiency and battery lifetime of
                     the monitoring units without sacrificing the acquired
                     signal quality is a key challenge in large-scale
                     deployment of bioelectronic systems for remote wireless
                     monitoring. In this article, we present a design
                     methodology for accuracy aware, energy efficient
                     wireless monitoring of electroencephalography (EEG)
                     data. The proposed design performs a real-time accuracy
                     energy trade-off by controlling the volume of
                     transmitted data based on the information content in
                     the EEG signal. We consider the effect of different
                     system parameters in order to design an optimal
                     system.},
  acknowledgement = ack-nhfb,
  articleno       = {3},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC volume 8, number 1 (February 2012), article 4.
@article{Naruse:2012:SDN,
  author          = {Makoto Naruse and Ferdinand Peper and Kouichi Akahane
                     and Naokatsu Yamamoto and Tadashi Kawazoe and Naoya
                     Tate and Motoichi Ohtsu},
  title           = {Skew Dependence of Nanophotonic Devices Based on
                     Optical Near-Field Interactions},
  journal         = j-JETC,
  volume          = {8},
  number          = {1},
  pages           = {4:1--4:??},
  month           = feb,
  year            = {2012},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2093145.2093149},
  issn            = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l          = {1550-4832},
  bibdate         = {Tue Feb 28 16:37:42 MST 2012},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {We examine the timing dependence of nanophotonic
                     devices based on optical excitation transfer via
                     optical near-field interactions at the nanometer scale.
                     We theoretically analyze the dynamic behavior of a
                     two-input nanophotonic switch composed of three quantum
                     dots based on a density matrix formalism while assuming
                     arrival-time differences, or skew, between the inputs.
                     The analysis reveals that the nanophotonic switch is
                     resistant to a skew longer than the input signal
                     duration, and the tolerance to skew is asymmetric with
                     respect to the two inputs. The skew dependence is also
                     experimentally examined based on near-field
                     spectroscopy of InGaAs quantum dots, showing good
                     agreement with the theory.},
  acknowledgement = ack-nhfb,
  articleno       = {4},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC volume 8, number 1 (February 2012), article 5.
@article{Ye:2012:TBH,
  author          = {Yaoyao Ye and Jiang Xu and Xiaowen Wu and Wei Zhang
                     and Weichen Liu and Mahdi Nikdast},
  title           = {A Torus-Based Hierarchical Optical-Electronic
                     {Network-on-Chip} for Multiprocessor {System-on-Chip}},
  journal         = j-JETC,
  volume          = {8},
  number          = {1},
  pages           = {5:1--5:??},
  month           = feb,
  year            = {2012},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2093145.2093150},
  issn            = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l          = {1550-4832},
  bibdate         = {Tue Feb 28 16:37:42 MST 2012},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Networks-on-chip (NoCs) are emerging as a key on-chip
                     communication architecture for multiprocessor
                     systems-on-chip (MPSoCs). Optical communication
                     technologies are introduced to NoCs in order to empower
                     ultra-high bandwidth with low power consumption.
                     However, in existing optical NoCs, communication
                     locality is poorly supported, and the importance of
                     floorplanning is overlooked. These significantly limit
                     the power efficiency and performance of optical NoCs.
                     In this work, we address these issues and propose a
                     torus-based hierarchical hybrid optical-electronic NoC,
                     called THOE. THOE takes advantage of both electrical
                     and optical routers and interconnects in a hierarchical
                     manner. It employs several new techniques including
                     floorplan optimization, an adaptive power control
                     mechanism, low-latency control protocols, and hybrid
                     optical-electrical routers with a low-power optical
                     switching fabric.},
  acknowledgement = ack-nhfb,
  articleno       = {5},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC volume 8, number 1 (February 2012), article 6.
@article{Manem:2012:DCM,
  author          = {H. Manem and J. Rajendran and G. S. Rose},
  title           = {Design Considerations for Multilevel {{CMOS\slash}
                     Nano} Memristive Memory},
  journal         = j-JETC,
  volume          = {8},
  number          = {1},
  pages           = {6:1--6:??},
  month           = feb,
  year            = {2012},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2093145.2093151},
  issn            = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l          = {1550-4832},
  bibdate         = {Tue Feb 28 16:37:42 MST 2012},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {With technology migration into nano and molecular
                     scales several hybrid CMOS/nano logic and memory
                     architectures have been proposed that aim to achieve
                     high device density with low power consumption. The
                     discovery of the memristor has further enabled the
                     realization of denser nanoscale logic and memory
                     systems by facilitating the implementation of
                     multilevel logic. This work describes the design of
                     such a multilevel nonvolatile memristor memory system,
                     and the design constraints imposed in the realization
                     of such a memory. In particular, the limitations on
                     load, bank size, number of bits achievable per device,
                     placed by the required noise margin for accurately
                     reading and writing the data stored in a device are
                     analyzed.},
  acknowledgement = ack-nhfb,
  articleno       = {6},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC volume 8, number 2 (June 2012), article 7.
%%% This entry carries no abstract field.
@article{Bhunia:2012:ISI,
  author          = {Swarup Bhunia and Darrin J. Young},
  title           = {Introduction to Special Issue on Implantable
                     Electronics},
  journal         = j-JETC,
  volume          = {8},
  number          = {2},
  pages           = {7:1--7:??},
  month           = jun,
  year            = {2012},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2180878.2180879},
  issn            = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l          = {1550-4832},
  bibdate         = {Sat Jun 23 12:02:51 MDT 2012},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  articleno       = {7},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC volume 8, number 2 (June 2012), article 8.
@article{Ko:2012:EHC,
  author          = {Wen H. Ko},
  title           = {Early History and Challenges of Implantable
                     Electronics},
  journal         = j-JETC,
  volume          = {8},
  number          = {2},
  pages           = {8:1--8:??},
  month           = jun,
  year            = {2012},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2180878.2180880},
  issn            = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l          = {1550-4832},
  bibdate         = {Sat Jun 23 12:02:51 MDT 2012},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Implantable systems for biomedical research and
                     clinical care are now a flourishing field of activities
                     in academia as well as industrial institutions. The
                     broad field includes experimental explorations in
                     electronics, mechanical, chemical, and biological
                     components and systems, and the combination of all
                     these. Today virtually all implants involve both
                     electronic circuits and
                     micro-electro-mechanical-systems (MEMS). This article
                     offers a very brief glance back at the early history of
                     implant electronics in the period from the 1950s to the
                     1970s, by employing selected examples from the author's
                     research. This short review also discusses the
                     challenges of implantable electronics at present, and
                     suggests some potentially important trends in the
                     future research and development of implantable
                     microsystems. It is aimed as an introduction of
                     implantable/attached electronic systems to research
                     engineers that are interested in implantable systems as
                     a section of Biomedical Instrumentations.},
  acknowledgement = ack-nhfb,
  articleno       = {8},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC volume 8, number 2 (June 2012), article 9.
%%% In the abstract, the micrometre unit is written as $ \mu $m with no
%%% space between the math-mode prefix and the unit letter, so that it
%%% typesets as one unit symbol.
@Article{Salam:2012:ICL,
  author =       "Muhammad Tariqus Salam and Mohamad Sawan and Dang Khoa
                 Nguyen",
  title =        "Implantable Closed-Loop Epilepsy Prosthesis: Modeling,
                 Implementation and Validation",
  journal =      j-JETC,
  volume =       "8",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2180878.2180881",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Jun 23 12:02:51 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "In this article, we present an implantable closed-loop
                 epilepsy prosthesis, which is dedicated to
                 automatically detect seizure onsets based on
                 intracerebral electroencephalographic (icEEG)
                 recordings from intracranial electrode contacts and
                 provide an electrical stimulation feedback to the same
                 contacts in order to disrupt these seizures. A novel
                 epileptic seizure detector and a dedicated electrical
                 stimulator were assembled together with common
                 recording electrodes to complete the proposed
                 prosthesis. The seizure detector was implemented in
                 CMOS 0.18-$ \mu $m by incorporating a new seizure
                 detection algorithm that models time-amplitude and
                 -frequency relationship in icEEG. The detector was
                 validated offline on ten patients with refractory
                 epilepsy and showed excellent performance for early
                 detection of seizures. The electrical stimulator, used
                 for suppressing the developing seizure, is composed of
                 two biphasic channels and was assembled with embedded
                 FPGA in a miniature PCB. The stimulator efficiency was
                 evaluated on cadaveric animal brain tissue in an in
                 vitro morphologic electrical model. Spatial
                 characteristics of the voltage distribution in cortex
                 were assessed in an attempt to identify optimal
                 stimulation parameters required to affect the suspected
                 epileptic focus. The experimental results suggest that
                 lower frequency stimulation parameters cause
                 significant amount of shunting of current through the
                 cerebrospinal fluid; however higher frequency
                 stimulation parameters produce effective spatial
                 voltage distribution with lower stimulation charge.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% JETC volume 8, number 2 (June 2012), article 10.
@article{Sharad:2012:LPA,
  author          = {Mrigank Sharad and Sumeet K. Gupta and Shriram
                     Raghunathan and Pedro P. Irazoqui and Kaushik Roy},
  title           = {Low-Power Architecture for Epileptic Seizure Detection
                     Based on Reduced Complexity {DWT}},
  journal         = j-JETC,
  volume          = {8},
  number          = {2},
  pages           = {10:1--10:??},
  month           = jun,
  year            = {2012},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2180878.2180882},
  issn            = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l          = {1550-4832},
  bibdate         = {Sat Jun 23 12:02:51 MDT 2012},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {In this article, we present a low-power,
                     user-programmable architecture for discrete wavelet
                     transform (DWT) based epileptic seizure detection
                     algorithm. A simplified, low-pass filter (LPF)-only-DWT
                     technique is employed in which energy contents of
                     different frequency bands are obtained by subtracting
                     quasi-averaged, consecutive LPF outputs. Training phase
                     is used to identify the range of critical DWT
                     coefficients that are in turn used to set
                     patient-specific system level parameters for minimizing
                     power consumption. The proposed optimizations allow the
                     design to work at significantly lower power in the
                     normal operation mode. The system has been tested on
                     neural data obtained from kainate-treated rats. The
                     design was implemented in TSMC-65nm technology and
                     consumes less than 550-nW power at 250-mV supply.},
  acknowledgement = ack-nhfb,
  articleno       = {10},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC volume 8, number 2 (June 2012), article 11.
@article{Majerus:2012:WUL,
  author          = {Steve J. A. Majerus and Steven L. Garverick and
                     Michael A. Suster and Paul C. Fletter and Margot S.
                     Damaser},
  title           = {Wireless, Ultra-Low-Power Implantable Sensor for
                     Chronic Bladder Pressure Monitoring},
  journal         = j-JETC,
  volume          = {8},
  number          = {2},
  pages           = {11:1--11:??},
  month           = jun,
  year            = {2012},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2180878.2180883},
  issn            = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l          = {1550-4832},
  bibdate         = {Sat Jun 23 12:02:51 MDT 2012},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {The wireless implantable/intracavity micromanometer
                     (WIMM) system was designed to fulfill the unmet need
                     for a chronic bladder pressure sensing device in
                     urological fields such as urodynamics for diagnosis and
                     neuromodulation for bladder control. Neuromodulation in
                     particular would benefit from a wireless bladder
                     pressure sensor which could provide real-time pressure
                     feedback to an implanted stimulator, resulting in
                     greater bladder capacity while using less power. The
                     WIMM uses custom integrated circuitry, a MEMS
                     transducer, and a wireless antenna to transmit pressure
                     telemetry at a rate of 10 Hz. Aggressive power
                     management techniques yield an average current draw of
                     $ 9 \mu $A from a 3.6-Volt micro-battery, which
                     minimizes the implant size. Automatic pressure offset
                     cancellation circuits maximize the sensing dynamic
                     range to account for drifting pressure offset due to
                     environmental factors, and a custom telemetry protocol
                     allows transmission with minimum overhead. Wireless
                     operation of the WIMM has demonstrated that the
                     external receiver can receive the telemetry packets,
                     and the low power consumption allows for at least 24
                     hours of operation with a 4-hour wireless recharge
                     session.},
  acknowledgement = ack-nhfb,
  articleno       = {11},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC volume 8, number 2 (June 2012), article 12.
%%% \mu is a math-mode-only macro, so the process feature size in the
%%% abstract is written as 0.35-$ \mu $m; a bare \mu in running text
%%% raises a LaTeX error when the abstract is typeset.
@Article{Huang:2012:IRD,
  author =       "Yu-Jie Huang and Hsin-Hung Liao and Pen-Li Huang and
                 Tao Wang and Yao-Joe Yang and Yao-Hong Wang and
                 Shey-Shi Lu",
  title =        "An Implantable Release-on-Demand {CMOS} Drug Delivery
                 {SoC} Using Electrothermal Activation Technique",
  journal =      j-JETC,
  volume =       "8",
  number =       "2",
  pages =        "12:1--12:??",
  month =        jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2180878.2180884",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Jun 23 12:02:51 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "An implantable system-on-a-chip (SoC) integrating
                 controller/actuation circuitry and 8 individually
                 addressable drug reservoirs is proposed for on-demand
                 drug delivery. It is implemented by standard 0.35-$ \mu $m
                 CMOS technology and post-IC processing. The post-IC
                 processing includes deposition of metallic membranes
                 (200{\AA} Pt/3000{\AA} Ti/200{\AA} Pt) to cap the drug
                 reservoirs, deep dry etching to carve drug reservoirs
                 in silicon as drug containers, and PDMS layer bonding
                 to enlarge the drug storage. Based on electrothermal
                 activation technique, drug releases can be precisely
                 controlled by wireless signals. The wireless
                 controller/actuation circuits including on-off keying
                 (OOK) receiver, microcontroller unit, clock generator,
                 power-on-reset circuit, and switch array are integrated
                 on the same chip, providing patients the ability of
                 remote drug activation and noninvasive therapy
                 modification. Implanted by minimally invasive surgery,
                 this SoC can be used for the precise drug dosing of
                 localized treatment, such as the cancer therapy, or the
                 immediate medication to some emergent diseases, such as
                 heart attack. In vitro experimental results show that
                 the reservoir content can be released successfully
                 through the rupture of the membrane which is appointed
                 by received wireless commands.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% JETC volume 8, number 2 (June 2012), article 13.
@article{Sun:2012:NMD,
  author          = {Zhenyu Sun and Xiang Chen and Yaojun Zhang and Hai Li
                     and Yiran Chen},
  title           = {Nonvolatile Memories as the Data Storage System for
                     Implantable {ECG} Recorder},
  journal         = j-JETC,
  volume          = {8},
  number          = {2},
  pages           = {13:1--13:??},
  month           = jun,
  year            = {2012},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2180878.2180885},
  issn            = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l          = {1550-4832},
  bibdate         = {Sat Jun 23 12:02:51 MDT 2012},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {In this article, we propose a data storage system with
                     the emerging nonvolatile memory technologies used for
                     the implantable electrocardiography (ECG) recorder. The
                     proposed storage system can record the digitalized
                     real-time ECG waveforms continuously inside the
                     implantable device and export the stored data to
                     external reader periodically to obtain a long-term
                     backup. Spin transfer torque random access memory
                     (STT-RAM) and spintronic memristor are selected as the
                     storage elements for their nonvolatility, high density,
                     high reliability, low power consumption, good
                     scalability, and CMOS technology compatibility. The new
                     read and write schemes of STT-RAM and spintronic
                     memristors are presented and optimized to fit the
                     specific application scenario. The tradeoffs among data
                     accuracy, chip area, and read/write energy for the
                     different technologies are thoroughly analyzed and
                     compared. Our simulation results show the configuration
                     with a data sampling rate (e.g., 128 Hz) and a
                     quantization resolution (e.g., 12 bits) can record
                     18-hour real-time data within $ \approx 3.6$-mm$^2$
                     chip area when the data storage is built with
                     single-level cell (SLC) STT-RAMs. Daily energy
                     consumption is $ 5.46$ mJ. Utilizing the multilevel
                     cell (MLC) STT-RAMs or the spintronic memristors as the
                     storage elements can further reduce the chip area and
                     decrease energy dissipation.},
  acknowledgement = ack-nhfb,
  articleno       = {13},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC volume 8, number 3 (August 2012), article 14.
%%% This entry carries no abstract field.
%%% The title is entered in Title Case, like the volume 8 number 1 and 2
%%% entries in this file; bibliography styles that want sentence case
%%% downcase it themselves.
@Article{Mohanty:2012:SSN,
  author =       "Saraju P. Mohanty",
  title =        "Special Section on New Circuit and Architecture-Level
                 Solutions for Multidiscipline Systems",
  journal =      j-JETC,
  volume =       "8",
  number =       "3",
  pages =        "14:1--14:??",
  month =        aug,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2287696.2287697",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Mon Aug 20 15:17:55 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% JETC volume 8, number 3 (August 2012), article 15.
%%% The title is entered in Title Case, like the volume 8 number 1 and 2
%%% entries in this file; bibliography styles that want sentence case
%%% downcase it themselves (the braced {CMOS LC} is protected).
@Article{Srivastava:2012:CLV,
  author =       "Ashok Srivastava and Yao Xu and Yang Liu and Ashwani
                 K. Sharma and Clay Mayberry",
  title =        "{CMOS LC} Voltage Controlled Oscillator Design Using
                 Multiwalled and Single-Walled Carbon Nanotube Wire
                 Inductors",
  journal =      j-JETC,
  volume =       "8",
  number =       "3",
  pages =        "15:1--15:??",
  month =        aug,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2287696.2287698",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Mon Aug 20 15:17:55 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "We have utilized our Multiwalled Carbon NanoTube
                 (MWCNT) and Single-Walled Carbon NanoTube (SWCNT)
                 bundle interconnects model in a widely used $ \pi $
                 model to study the performances of MWCNT and SWCNT
                 bundle wire inductors and compared these with copper
                 (Cu) inductors. The calculation results show that the
                 Q-factors of Carbon NanoTube (CNT) wire (SWCNT bundle
                 and MWCNT) inductors are higher than that of the Cu
                 wire inductor. This is mainly due to much lower
                 resistance of CNT and negligible skin effect in carbon
                 nanotubes at higher frequencies. The application of CNT
                 wire inductor in LC VCO is also studied and the
                 Cadence/Spectre simulations show that VCOs with CNT
                 bundle wire inductors have significantly improved
                 performance such as the higher oscillation frequency
                 and lower phase noise due to their smaller resistances
                 and higher Q-factors. It is also noticed that CMOS LC
                 VCO using a SWCNT bundle wire inductor has better
                 performance when compared with the performance of LC
                 VCO using the MWCNT wire inductor due to its lower
                 resistance and higher Q-factor.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% JETC volume 8, number 3 (August 2012), article 16.
%%% The title is entered in Title Case, like the volume 8 number 1 and 2
%%% entries in this file; bibliography styles that want sentence case
%%% downcase it themselves (the braced {VLSI} is protected).
@Article{Mahalingam:2012:DCS,
  author =       "Venkataraman Mahalingam and Nagarajan Ranganathan and
                 Ransford {Hyman, Jr.}",
  title =        "Dynamic Clock Stretching for Variation Compensation in
                 {VLSI} Circuit Design",
  journal =      j-JETC,
  volume =       "8",
  number =       "3",
  pages =        "16:1--16:??",
  month =        aug,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2287696.2287699",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Mon Aug 20 15:17:55 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "In the nanometer era, process, voltage, and
                 temperature variations are dominating circuit
                 performance, power, and yield. Over the past few years,
                 statistical optimization methods have been effective in
                 improving yield in the presence of uncertainty due to
                 process variations. However, statistical methods
                 overconsume resources, even in the absence of
                 variations. Hence, to facilitate a better
                 performance-power-yield trade-off, techniques that can
                 dynamically enable variation compensation are becoming
                 necessary. In this article, we propose a dynamic
                 technique that controls the instance of data capture in
                 critical path memory flops, by delaying the clock edge
                 trigger. The methodology employs a dynamic delay
                 detection circuit to identify the uncertainty in delay
                 due to variations and stretches the clock in the
                 destination flip-flops. The delay detection circuit
                 uses a latch and set of combinational gates to
                 dynamically detect and create the slack needed to
                 accommodate the delay due to variations. The Clock
                 Stretching Logic (CSL) is added only to paths, which
                 have a high probability of failure in the presence of
                 variations. The proposed methodology improves the
                 timing yield of the circuit without significant
                 overcompensation. The methodology approach was
                 simulated using Synopsys design tools for circuit
                 synthesis and Cadence tools for placement and routing
                 of the design. Extraction of parasitic of timing
                 information was parsed using Perl scripts and simulated
                 using a simulation program generated in C++.
                 Experimental results based on Monte-Carlo simulations
                 on benchmark circuits indicate considerable improvement
                 in timing yield with negligible area overhead.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% JETC volume 8, number 3 (August 2012), article 17.
%%% The closing sentence of the abstract sets the block count and the
%%% layer count as complete math expressions, $ 4^n $ and $ (n + 1) $,
%%% so that the exponent attaches to its base and n is typeset as a
%%% variable in both places.
%%% The title is entered in Title Case, like the volume 8 number 1 and 2
%%% entries in this file; bibliography styles that want sentence case
%%% downcase it themselves.
@Article{Roy:2012:CAL,
  author =       "Sudip Roy and Debasis Mitra and Bhargab B.
                 Bhattacharya and Krishnendu Chakrabarty",
  title =        "Congestion-Aware Layout Design for High-Throughput
                 Digital Microfluidic Biochips",
  journal =      j-JETC,
  volume =       "8",
  number =       "3",
  pages =        "17:1--17:??",
  month =        aug,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2287696.2287700",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Mon Aug 20 15:17:55 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Potential applications of digital microfluidic (DMF)
                 biochips now include several areas of real-life
                 applications like environmental monitoring, water and
                 air pollutant detection, and food processing to name a
                 few. In order to achieve sufficiently high throughput
                 for these applications, several instances of the same
                 bioassay may be required to be executed concurrently on
                 different samples. As a straightforward implementation,
                 several identical biochips can be integrated on a
                 single substrate as a multichip to execute the assay
                 for various samples concurrently. Controlling
                 individual electrodes of such a chip by independent
                 pins may not be acceptable since it increases the cost
                 of fabrication. Thus, in order to keep the overall
                 pin-count within an acceptable bound, all the
                 respective electrodes of these individual pieces are
                 connected internally underneath the chip so that they
                 can be controlled with a single external control pin.
                 In this article, we present an orientation strategy for
                 layout of a multichip that reduces routing congestion
                 and consequently facilitates wire routing for the
                 electrode array. The electrode structure of the
                 individual pieces of the multichip may be either
                 direct-addressable or pin-constrained. The method also
                 supports a hierarchical approach to wire routing that
                 ensures scalability. In this scheme, the size of the
                 biochip in terms of the total number of electrodes may
                 be increased by a factor of four by increasing the
                 number of routing layers by only one. In general, for a
                 multichip with $ 4^n $ identical blocks, $ (n + 1) $
                 layers are sufficient for wire routing.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Komerath:2012:RBP,
  author =       "Narayanan Komerath and Aravinda Kar",
  title =        "Retail beamed power using millimeter waves: Survey",
  journal =      j-JETC,
  volume =       "8",
  number =       "3",
  pages =        "18:1--18:??",
  month =        aug,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2287696.2287701",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Mon Aug 20 15:17:55 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Retail delivery of electric power through millimeter
                 waves is relevant in developing areas where the market
                 for communication devices outpaces the power grid
                 infrastructure. It is also a critical component of an
                 evolutionary path towards terrestrial and space-based
                 renewable power generation. Narrow-band power can be
                 delivered as focused beams to receivers near end-users,
                 from central power plants, rural distribution points,
                 UAVs, tethered aerostats, stratospheric airship
                 platforms, or space satellites. The article surveys the
                 available knowledge base on millimeter wave beamed
                 power delivery. It then considers design requirements
                 for a retail beamed power architecture, in the context
                 of rural India where power delivery is lagging behind
                 the demand growth for connectivity. A survey of
                 technology developments relevant to millimeter wave
                 beaming is conducted, and indicates that massive,
                 mass-produced solid-state arrays capable of achieving
                 good efficiency and cost effectiveness are possible in
                 the near term to enable such retail power beaming
                 architectures.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Palaniswamy:2012:EHI,
  author =       "Ashok Kumar Palaniswamy and Spyros Tragoudas",
  title =        "An efficient heuristic to identify threshold logic
                 functions",
  journal =      j-JETC,
  volume =       "8",
  number =       "3",
  pages =        "19:1--19:??",
  month =        aug,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2287696.2287702",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Mon Aug 20 15:17:55 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "A fast method to identify the given Boolean function
                 as a threshold function with weight assignment is
                 introduced. It characterizes the function based on the
                 parameters that have been defined in the literature.
                 The proposed method is capable to quickly characterize
                 all functions that have less than eight inputs and has
                 been shown to operate fast for functions with as many
                 as forty inputs. Furthermore, comparisons with other
                 existing heuristic methods show huge increase in the
                 number of threshold functions identified, and drastic
                 reduction in time and complexity.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Xu:2012:EPV,
  author =       "Hu Xu and Vasilis F. Pavlidis and Giovanni {De
                 Micheli}",
  title =        "Effect of process variations in {$3$D} global clock
                 distribution networks",
  journal =      j-JETC,
  volume =       "8",
  number =       "3",
  pages =        "20:1--20:??",
  month =        aug,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2287696.2287703",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Mon Aug 20 15:17:55 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "In three-dimensional (3D) integrated circuits, the
                 effect of process variations on clock skew differs from
                 2D circuits. The combined effect of inter-die and
                 intra-die process variations on the design of 3D clock
                 distribution networks is considered in this article. A
                 statistical clock skew model incorporating both the
                 systematic and random components of process variations
                 is employed to describe this effect. Two regular 3D
                 clock tree topologies are investigated and compared in
                 terms of clock skew variation. The statistical skew
                 model used to describe clock skew variations is
                 verified through Monte-Carlo simulations. The clock
                 skew is shown to change in different ways with the
                 number of planes forming the 3D IC and the clock
                 network architecture. Simulations based on a 45-nm CMOS
                 technology show that the maximum standard deviation of
                 clock skew can vary from 15 ps to 77 ps. Results
                 indicate that simply increasing the number of planes of
                 a 3D IC does not necessarily lead to lower skew
                 variation and higher operating frequencies. A
                 multigroup 3D clock tree topology is proposed to
                 effectively mitigate the variability of clock skew.
                 Tradeoffs between the investigated 3D clock
                 distribution networks and the number of planes
                 comprising a 3D circuit are discussed and related
                 design guidelines are offered. The skew variation in 3D
                 clock trees is also compared with the skew variation of
                 clock grids.",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Kursun:2012:STT,
  author =       "Eren Kursun and Jamil Wakil and Mukta Farooq and
                 Robert Hannon",
  title =        "Spatial and temporal thermal characterization of
                 stacked multicore architectures",
  journal =      j-JETC,
  volume =       "8",
  number =       "3",
  pages =        "21:1--21:??",
  month =        aug,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2287696.2287704",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Mon Aug 20 15:17:55 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Three-dimensional integration provides a new way of
                 performance growth for microprocessor architectures.
                 While recent studies report promising performance
                 improvement numbers, majority of the processor stacking
                 options are thermally-limited. Elevated stack
                 temperatures have significant effect on the overall
                 energy efficiency and reliability of the processor;
                 they also limit the potential peak performance
                 improvement from the 3D implementation. Thermal
                 characteristics of 3D stacks differ from 2D processors
                 in various ways including: the nature of heat
                 dissipation throughout the stack, thermal conductivity
                 of the 3D structures such as micro-C4 layers, and
                 hotspot interactions among layers. The intensity of the
                 corresponding thermal problems is highly dependent on
                 the 3D technology, processor and stack parameters. In
                 this study we focus on spatial and temporal thermal
                 characteristics of 3D multicore architectures using
                 high-fidelity technology and processor models. Our
                 experimental results highlight the need for integrating
                 detailed thermal models in the design flow, starting
                 with the early design stages. In addition, the reduced
                 time constants and elevated on-chip temperatures
                 indicate faster response time requirements for dynamic
                 thermal management in processor stacking options.",
  acknowledgement = ack-nhfb,
  articleno =    "21",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Liu:2012:RAP,
  author =       "Bao Liu and Xuemei Chen and Fiona Teshome",
  title =        "Resilient and adaptive performance logic",
  journal =      j-JETC,
  volume =       "8",
  number =       "3",
  pages =        "22:1--22:??",
  month =        aug,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2287696.2287705",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Mon Aug 20 15:17:55 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "As VLSI technology continues scaling, increasingly
                 significant parametric variations and increasingly
                 prevalent defects present unprecedented challenges to
                 VLSI design at nanometer scale. Specifically,
                 performance variability has hindered performance
                 scaling, while soft errors become an emerging problem
                 for logic computation at recent technology nodes. In
                 this article, we leverage the existing Totally
                 Self-Checking (TSC)/Strongly Fault-Secure (SFS) logic
                 design techniques, and propose Resilient and Adaptive
                 Performance (RAP) logic for maximum adaptive
                 performance and soft error resilience in nanoscale
                 computing. RAP logic clears all timing errors in the
                 absence of external soft errors, albeit at a higher
                 area/power cost compared with Razor logic. Our
                 experimental results further show that dual-rail static
                 (Domino) RAP logic outperforms alternative
                 Delay-Insensitive (DI) code-based static (Domino) RAP
                 logic with less area, higher performance, and lower
                 power consumption for the large test cases, and
                 achieves an average of 2.29(2.41)$ \times $ performance
                 boost, 2.12(1.91)$ \times $ layout area, and
                 2.38(2.34)$ \times $ power consumption compared with
                 the traditional minimum area static logic based on the
                 Nangate 45-nm open cell library.",
  acknowledgement = ack-nhfb,
  articleno =    "22",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Chang:2012:PED,
  author =       "Kevin Chang and Sujay Deb and Amlan Ganguly and Xinmin
                 Yu and Suman Prasad Sah and Partha Pratim Pande and
                 Benjamin Belzer and Deukhyoun Heo",
  title =        "Performance evaluation and design trade-offs for
                 wireless network-on-chip architectures",
  journal =      j-JETC,
  volume =       "8",
  number =       "3",
  pages =        "23:1--23:??",
  month =        aug,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2287696.2287706",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Mon Aug 20 15:17:55 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Massive levels of integration are making modern
                 multicore chips all pervasive in several domains. High
                 performance, robustness, and energy-efficiency are
                 crucial for the widespread adoption of such platforms.
                 Networks-on-Chip (NoCs) have emerged as communication
                 backbones to enable a high degree of integration in
                 multicore Systems-on-Chip (SoCs). Despite their
                 advantages, an important performance limitation in
                 traditional NoCs arises from planar metal
                 interconnect-based multihop links with high latency and
                 power consumption. This limitation can be addressed by
                 drawing inspiration from the evolution of natural
                 complex networks, which offer great performance-cost
                 trade-offs. Analogous with many natural complex
                 systems, future multicore chips are expected to be
                 hierarchical and heterogeneous in nature as well. In
                 this article we undertake a detailed performance
                 evaluation for hierarchical small-world NoC
                 architectures where the long-range communications links
                 are established through the millimeter-wave wireless
                 communication channels. Through architecture-space
                 exploration in conjunction with novel power-efficient
                 on-chip wireless link design, we demonstrate that it is
                 possible to improve performance of conventional NoC
                 architectures significantly without incurring high area
                 overhead.",
  acknowledgement = ack-nhfb,
  articleno =    "23",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Choi:2012:DQA,
  author =       "Byung-Soo Choi and Rodney {Van Meter}",
  title =        "A {$ \Theta (\sqrt n) $}-depth quantum adder on the
                 {$2$D NTC} quantum computer architecture",
  journal =      j-JETC,
  volume =       "8",
  number =       "3",
  pages =        "24:1--24:??",
  month =        aug,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2287696.2287707",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Mon Aug 20 15:17:55 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "In this work, we propose an adder for the
                 2-Dimensional Nearest-Neighbor, Two-Qubit gate,
                 Concurrent (2D NTC) architecture, designed to match the
                 architectural constraints of many quantum computing
                 technologies. The chosen architecture allows the layout
                 of logical qubits in two dimensions with $ \sqrt n $
                 columns where each column has $ \sqrt n $ qubits and
                 the concurrent execution of one- and two-qubit gates
                 with nearest-neighbor interaction only. The proposed
                 adder works in three phases. In the first phase, the
                 first column generates the summation output and the
                 other columns do the carry-lookahead operations. In the
                 second phase, these intermediate values are propagated
                 from column to column, preparing for computation of the
                 final carry for each register position. In the last
                 phase, each column, except the first one, generates the
                 summation output using this column-level carry. The
                 depth and the number of qubits of the proposed adder
                 are $ \Theta (\sqrt n) $ and $ O(n) $, respectively.
                 The proposed adder executes faster than the adders
                 designed for the 1D NTC architecture when the length of
                 the input registers $n$ is larger than 51.",
  acknowledgement = ack-nhfb,
  articleno =    "24",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Huang:2012:PDT,
  author =       "Jiale Huang and Minhao Zhu and Shengqi Yang and Pallav
                 Gupta and Wei Zhang and Steven M. Rubin and Gilda
                 Garret{\'o}n and Jin He",
  title =        "A physical design tool for carbon nanotube
                 field-effect transistor circuits",
  journal =      j-JETC,
  volume =       "8",
  number =       "3",
  pages =        "25:1--25:??",
  month =        aug,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2287696.2287708",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Mon Aug 20 15:17:55 MDT 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "In this article, we present a graphical Computer-Aided
                 Design (CAD) environment for the design, analysis, and
                 layout of Carbon NanoTube (CNT) Field-Effect Transistor
                 (CNFET) circuits. This work is motivated by the fact
                 that such a tool currently does not exist in the public
                 domain for researchers. Our tool has been integrated
                 within Electric, a very powerful, yet free CAD system
                 for custom design of Integrated Circuits (ICs). The
                 tool supports CNFET schematic and layout entry, rule
                 checking, and HSpice/VerilogA netlist generation. We
                 provide users with a customizable CNFET technology
                 library with the ability to specify $ \lambda $-based
                 design rules. We showcase the capabilities of our tool
                 by demonstrating the design of a large CNFET standard
                 cell and components library. Meanwhile, HSPICE
                 simulations also have been presented for cell library
                 characterization. We hope that the availability of this
                 tool will invigorate the CAD community to explore novel
                 ideas in CNFET circuit design.",
  acknowledgement = ack-nhfb,
  articleno =    "25",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Pande:2012:ISI,
  author =       "Partha Pratim Pande and Amlan Ganguly",
  title =        "Introduction to the special issue on sustainable and
                 green computing systems",
  journal =      j-JETC,
  volume =       "8",
  number =       "4",
  pages =        "26:1--26:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2367736.2367737",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Nov 28 17:25:59 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "26",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Banerjee:2012:TNZ,
  author =       "Prithviraj Banerjee and Chandrakant Patel and Cullen
                 Bash and Amip Shah and Martin Arlitt",
  title =        "Towards a net-zero data center",
  journal =      j-JETC,
  volume =       "8",
  number =       "4",
  pages =        "27:1--27:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2367736.2367738",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Nov 28 17:25:59 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "A world consisting of billions of service-oriented
                 client devices and thousands of data centers can
                 deliver a diverse range of services, from social
                 networking to management of our natural resources.
                 However, these services must scale in order to meet the
                 fundamental needs of society. To enable such scaling,
                 the total cost of ownership of the data centers that
                 host the services and comprise the vast majority of
                 service delivery costs will need to be reduced. As
                 energy drives the total cost of ownership of data
                 centers, there is a need for a new paradigm in design
                 and management of data centers that minimizes energy
                 used across their lifetimes, from ``cradle to cradle''.
                 This tutorial article presents a blueprint for a
                 ``net-zero data center'': one that offsets any
                 electricity used from the grid via adequate on-site
                 power generation that gets fed back to the grid at a
                 later time. We discuss how such a data center addresses
                 the total cost of ownership, illustrating that contrary
                 to the oft-held view of sustainability as ``paying more
                 to be green'', sustainable data centers---built on a
                 framework that focuses on integrating supply and demand
                 management from end-to-end---can concurrently lead to
                 lowest cost and lowest environmental impact.",
  acknowledgement = ack-nhfb,
  articleno =    "27",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Garg:2012:TDL,
  author =       "Siddharth Garg and Diana Marculescu and Radu
                 Marculescu",
  title =        "Technology-driven limits on runtime power management
                 algorithms for multiprocessor systems-on-chip",
  journal =      j-JETC,
  volume =       "8",
  number =       "4",
  pages =        "28:1--28:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2367736.2367739",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Nov 28 17:25:59 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Runtime power management is a critical technique for
                 reducing the energy footprint of digital electronic
                 devices and enabling sustainable computing, since it
                 allows electronic devices to dynamically adapt their
                 power and energy consumption to meet performance
                 requirements. In this article, we consider the case of
                 MultiProcessor Systems-on-Chip (MPSoC) implemented
                 using multiple Voltage and Frequency Islands (VFIs)
                 relying on fine-grained Dynamic Voltage and Frequency
                 Scaling (DVFS) to reduce the system power dissipation.
                 In particular, we present a framework to theoretically
                 analyze the impact of three important technology-driven
                 constraints; (i) reliability-driven upper limits on the
                 maximum supply voltage; (ii) inductive noise-driven
                 constraints on the maximum rate of change of
                 voltage/frequency; and (iii) the impact of
                 manufacturing process variations on the performance of
                 DVFS control for multiple VFI MPSoCs. The proposed
                 analysis is general, in the sense that it is not bound
                 to a specific DVFS control algorithm, but instead
                 focuses on theoretically bounding the performance that
                 any DVFS controller can possibly achieve. Our
                 experimental results on real and synthetic benchmarks
                 show that in the presence of reliability- and
                 temperature-driven constraints on the maximum frequency
                 and maximum frequency increment, any DVFS control
                 algorithm will lose up to 87\% performance in terms of
                 the number of steps required to reach a reference
                 steady state. In addition, increasing process
                 variations can lead to up to 60\% of fabricated chips
                 being unable to meet the specified DVFS control
                 specifications, irrespective of the DVFS algorithm
                 used. Nonetheless, we note that although conventional
                 DVFS might become less effective with technology
                 scaling, it will continue to play an important role in
                 the context of emerging power management techniques,
                 for example, for massively parallel multiprocessor
                 systems where only a subset of cores can be turned on
                 at any given point of time due to total power
                 constraints.",
  acknowledgement = ack-nhfb,
  articleno =    "28",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Ghidini:2012:EEM,
  author =       "Giacomo Ghidini and Sajal K. Das",
  title =        "Energy-efficient {Markov} chain-based duty cycling
                 schemes for greener wireless sensor networks",
  journal =      j-JETC,
  volume =       "8",
  number =       "4",
  pages =        "29:1--29:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2367736.2367740",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Nov 28 17:25:59 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "To extend the lifetime of a wireless sensor network,
                 sensor nodes usually duty cycle between dormant and
                 active states. Duty cycling schemes are often evaluated
                 in terms of connection delay, connection duration, and
                 duty cycle. In this article, we show with experiments
                 on Sun SPOT sensors that duty cycling time (energy)
                 efficiency, that is, the ratio of time (energy)
                 employed in ancillary operations when switching from
                 and into deep sleep mode, is an important performance
                 metric too. We propose a novel randomized duty cycling
                 scheme based on Markov chains with the goal of (i)
                 reducing the connection delay, while maintaining a
                 given time (energy) efficiency, or (ii) keeping a
                 constant connection delay, while increasing the time
                 (energy) efficiency. Analytical and experimental
                 results demonstrate that the Markov chain-based scheme
                 can improve the performance in terms of connection
                 delay without affecting the time efficiency, or vice
                 versa, as opposed to the trade-off observed in
                 traditional schemes. We extend the proposed duty
                 cycling scheme to a partially randomized scheme, where
                 wireless nodes can switch into active state beyond
                 their schedules when their neighbors are active to
                 anticipate message forwarding. The analytical and
                 experimental results confirm the relationship between
                 connection delay and time efficiency also for this
                 scheme.",
  acknowledgement = ack-nhfb,
  articleno =    "29",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Sego:2012:IDC,
  author =       "Landon H. Sego and Andr{\'e}s M{\'a}rquez and Andrew
                 Rawson and Tahir Cader and Kevin Fox and William I.
                 {Gustafson, Jr.} and Christopher J. Mundy",
  title =        "Implementing the data center energy productivity
                 metric",
  journal =      j-JETC,
  volume =       "8",
  number =       "4",
  pages =        "30:1--30:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2367736.2367741",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Nov 28 17:25:59 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "As data centers proliferate in size and number, the
                 endeavor to improve their energy efficiency and
                 productivity is becoming increasingly important. We
                 discuss the properties of a number of the proposed
                 metrics of energy efficiency and productivity. In
                 particular, we focus on the Data Center Energy
                 Productivity (DCeP) metric, which is the ratio of
                 useful work produced by the data center to the energy
                 consumed performing that work. We describe our approach
                 for using DCeP as the principal outcome of a designed
                 experiment using a highly instrumented,
                 high-performance computing data center. We found that
                 DCeP was successful in clearly distinguishing different
                 operational states in the data center, thereby
                 validating its utility as a metric for identifying
                 configurations of hardware and software that would
                 improve (or even maximize) energy productivity. We also
                 discuss some of the challenges and benefits associated
                 with implementing the DCeP metric, and we examine the
                 efficacy of the metric in making comparisons within a
                 data center and among data centers.",
  acknowledgement = ack-nhfb,
  articleno =    "30",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% JETC 8(4), October 2012, article 31.
@Article{Anagnostopoulou:2012:BAM,
  author          = {Vlasia Anagnostopoulou and Susmit Biswas and Heba
                     Saadeldeen and Alan Savage and Ricardo Bianchini and
                     Tao Yang and Diana Franklin and Frederic T. Chong},
  title           = {Barely alive memory servers: Keeping data active in a
                     low-power state},
  journal         = j-JETC,
  volume          = {8},
  number          = {4},
  pages           = {31:1--31:??},
  month           = oct,
  year            = {2012},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2367736.2367742},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Wed Nov 28 17:25:59 MST 2012},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Current resource provisioning schemes in Internet
                     services leave servers less than 50\% utilized almost
                     all the time. At this level of utilization, the
                     servers' energy efficiency is substantially lower than
                     at peak utilization. A solution to this problem could
                     be dynamically consolidating workloads into fewer
                     servers and turning others off. However, services
                     typically resist doing so, because of high response
                     times during reactivation in handling traffic spikes.
                     Moreover, services often want the memory and/or storage
                     of all servers to be readily available at all times. In
                     this article, we propose a family of barely alive
                     active low-power server states that facilitates both
                     fast reactivation and access to memory while in a
                     low-power state. We compare these states to previously
                     proposed active and idle states. In particular, we
                     investigate the impact of load bursts in each
                     energy-saving scheme. We also evaluate the additional
                     benefits of memory access under low-power states with a
                     study of a search service using a cooperative
                     main-memory cache. Finally, we propose a system that
                     combines a barely-alive state with the off state. We
                     find that the barely alive states can reduce service
                     energy consumption by up to 38\%, compared to an
                     energy-oblivious system. We also find that these energy
                     savings are consistent across a large parameter
                     space.},
  acknowledgement = ack-nhfb,
  articleno       = {31},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC 8(4), October 2012, article 32.
@Article{Sheikh:2012:EPA,
  author          = {Hafiz Fahad Sheikh and Hengxing Tan and Ishfaq Ahmad
                     and Sanjay Ranka and Phanisekhar Bv},
  title           = {Energy- and performance-aware scheduling of tasks on
                     parallel and distributed systems},
  journal         = j-JETC,
  volume          = {8},
  number          = {4},
  pages           = {32:1--32:??},
  month           = oct,
  year            = {2012},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2367736.2367743},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Wed Nov 28 17:25:59 MST 2012},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Enabled by high-speed networking in commercial,
                     scientific, and government settings, the realm of high
                     performance is burgeoning with greater amounts of
                     computational and storage resources. Large-scale
                     systems such as computational grids consume a
                     significant amount of energy due to their massive
                     sizes. The energy and cooling costs of such systems are
                     often comparable to the procurement costs over a year
                     period. In this survey, we will discuss allocation and
                     scheduling algorithms, systems, and software for
                     reducing power and energy dissipation of workflows on
                     the target platforms of single processors, multicore
                     processors, and distributed systems. Furthermore,
                     recent research achievements will be investigated that
                     deal with power and energy efficiency via different
                     power management techniques and application scheduling
                     algorithms. The article provides a comprehensive
                     presentation of the architectural, software, and
                     algorithmic issues for energy-aware scheduling of
                     workflows on single, multicore, and parallel
                     architectures. It also includes a systematic taxonomy
                     of the algorithms developed in the literature based on
                     the overall optimization goals and characteristics of
                     applications.},
  acknowledgement = ack-nhfb,
  articleno       = {32},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC 8(4), October 2012, article 33.
@Article{Kant:2012:EDC,
  author =       "Krishna Kant and Muthukumar Murugan and David H. C.
                 Du",
  title =        "Enhancing data center sustainability through
                 energy-adaptive computing",
  journal =      j-JETC,
  volume =       "8",
  number =       "4",
  pages =        "33:1--33:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2367736.2367744",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Nov 28 17:25:59 MST 2012",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "The sustainability concerns of Information Technology
                 (IT) go well beyond energy-efficient computing and
                 require techniques for minimizing environmental impact
                 of IT infrastructure over its entire life-cycle.
                 Traditionally, IT infrastructure is overdesigned at all
                 levels from chips to entire data centers and ecosystem;
                 the paradigm explored in this article is to replace
                 overdesign with rightsizing coupled with smarter
                 control, henceforth referred to as Energy-Adaptive
                 Computing or EAC. The article lays out the challenges
                 of EAC in various environments in terms of the
                 adaptation of the workload and the infrastructure to
                 cope with energy and cooling deficiencies. The article
                 then focuses on implementing EAC in a data center
                 environment, and addresses the problem of simultaneous
                 energy demand and energy supply regulation at multiple
                 levels, from servers to the entire data center.
                 The proposed control scheme adapts the assignments of
                 tasks to servers in a way that can cope with the
                 varying energy limitations. The article also presents
                 some experimental results to show how the scheme can
                 continue to meet Quality of Service (QoS) requirements
                 of tasks under energy limitations.",
  acknowledgement = ack-nhfb,
  articleno =    "33",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% JETC 8(4), October 2012, article 34.
@Article{Abbasi:2012:DGD,
  author          = {Zahra Abbasi and Tridib Mukherjee and Georgios
                     Varsamopoulos and Sandeep K. S. Gupta},
  title           = {{DAHM}: a green and dynamic {Web} application hosting
                     manager across geographically distributed data
                     centers},
  journal         = j-JETC,
  volume          = {8},
  number          = {4},
  pages           = {34:1--34:??},
  month           = oct,
  year            = {2012},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2367736.2367745},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Wed Nov 28 17:25:59 MST 2012},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Dynamic Application Hosting Management (DAHM) is
                     proposed for geographically distributed data centers,
                     which decides on the number of active servers and on
                     the workload share of each data center. DAHM achieves
                     cost-efficient application hosting by taking into
                     account: (i) the spatio-temporal variation of energy
                     cost, (ii) the data center computing and cooling energy
                     efficiency, (iii) the live migration cost, and (iv) any
                     SLA violations due to migration overhead or network
                     delay. DAHM is modeled as fixed-charge min-cost flow
                     and mixed integer programming for stateless and
                     stateful applications, respectively, and it is shown
                     NP-hard. We also develop heuristic algorithms and
                     prove, when applications are stateless and servers have
                     an identical power consumption model, that the
                     approximation ratio on the minimum total cost is
                     bounded by the number of data centers. Further, the
                     heuristics are evaluated in a simulation study using
                     realistic parameter data; compared to a
                     performance-oriented application assignment, that is,
                     hosting at the data center with the least delay, the
                     potential cost savings of DAHM reaches 33\%. The
                     savings come from reducing the total number of active
                     servers as well as leveraging the cost efficiency of
                     data centers. Through the simulation study, the article
                     further explores how relaxing the delay requirement for
                     a small fraction of users can increase the cost savings
                     of DAHM.},
  acknowledgement = ack-nhfb,
  articleno       = {34},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC 9(1), February 2013, article 1.
@Article{Srinivasan:2013:NAF,
  author          = {S. Srinivasan and V. Kamakoti and A. Bhattacharya},
  title           = {A Novel Algorithm for Fast Synthesis of {DNA} Probes
                     on Microarrays},
  journal         = j-JETC,
  volume          = {9},
  number          = {1},
  pages           = {1:1--1:??},
  month           = feb,
  year            = {2013},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2422094.2422095},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Wed Feb 20 16:42:57 MST 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {DNA microarrays are used extensively for biochemical
                     analysis that includes genomics and drug discovery.
                     This increased usage demands large microarrays, thus
                     complicating their computer aided design (CAD) and
                     manufacturing methodologies. One such time-consuming
                     design problem is to minimize the border length of
                     masks used during the manufacture of microarrays. From
                     the manufacturing point of view the border length of
                     masks is one of the crucial parameters determining the
                     reliability of the microarray. This article presents a
                     novel algorithm for synthesis (placement and embedding)
                     of microarrays, which consumes significantly less time
                     than the best algorithm reported in the literature,
                     while maintaining the quality (border length of masks)
                     of the result. The proposed technique uses only a part
                     of each probe to decide on the placement and the
                     remaining parts for deciding on the embedding sequence.
                     This is in contrast to the earlier methods that
                     considered the entire probe for both placement and
                     embedding. The second novelty of the proposed technique
                     is the preclassification (prior to placement and
                     embedding) of probes based on their prefixes. This
                     decreases the complexity of the problem of deciding the
                     next probe to be placed from that involving computation
                     of Hamming distance between all probes (as used in
                     earlier approaches) to the one involving searching of
                     nonempty cells on a constant size grid array. The
                     proposed algorithm is $ 43 \times $ faster than the
                     best reported in the literature for the case of
                     synthesizing a microarray with 250,000 probes and
                     further exhibits linear behavior in terms of
                     computation time for larger microarrays.},
  acknowledgement = ack-nhfb,
  articleno       = {1},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC 9(1), February 2013, article 2.
@Article{Maftei:2013:MBS,
  author          = {Elena Maftei and Paul Pop and Jan Madsen},
  title           = {Module-Based Synthesis of Digital Microfluidic
                     Biochips with Droplet-Aware Operation Execution},
  journal         = j-JETC,
  volume          = {9},
  number          = {1},
  pages           = {2:1--2:??},
  month           = feb,
  year            = {2013},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2422094.2422096},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Wed Feb 20 16:42:57 MST 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Microfluidic biochips represent an alternative to
                     conventional biochemical analyzers. A digital biochip
                     manipulates liquids not as continuous flow, but as
                     discrete droplets on a two-dimensional array of
                     electrodes. Several electrodes are dynamically grouped
                     to form a virtual device, on which operations are
                     executed by moving the droplets. So far, researchers
                     have ignored the locations of droplets inside devices,
                     considering that all the electrodes forming the device
                     are occupied throughout the operation execution. In
                     this article, we consider a droplet-aware execution of
                     microfluidic operations, which means that we know the
                     exact position of droplets inside the modules at each
                     time-step. We propose a Tabu Search-based metaheuristic
                     for the synthesis of digital biochips with
                     droplet-aware operation execution. Experimental results
                     show that our approach can significantly reduce the
                     application completion time, allowing us to use smaller
                     area biochips and thus reduce costs.},
  acknowledgement = ack-nhfb,
  articleno       = {2},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC 9(1), February 2013, article 3.
@Article{Peper:2013:BCF,
  author          = {Ferdinand Peper and Jia Lee and Josep Carmona and
                     Jordi Cortadella and Kenichi Morita},
  title           = {{Brownian} Circuits: Fundamentals},
  journal         = j-JETC,
  volume          = {9},
  number          = {1},
  pages           = {3:1--3:??},
  month           = feb,
  year            = {2013},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2422094.2422097},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Wed Feb 20 16:42:57 MST 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Random fluctuations will be a major factor interfering
                     with the operation of nanometer scale electronic
                     devices. This article presents circuit architectures
                     that can exploit such fluctuations, if signals have a
                     particle-like (discrete, token-based) character. We
                     define an abstract circuit primitive that, though
                     lacking functionality when used with fluctuation-free
                     signals, becomes universal when fluctuations are
                     allowed. Key to the power of a signal's fluctuations is
                     the ability to explore the state space of a circuit.
                     This ability is used to resolve deadlock situations,
                     which could otherwise only be averted by increased
                     design complexity. The results in this article suggest
                     that in the design of future computers, signal
                     fluctuations, rather than being an impediment to be
                     avoided at any cost, may be an important ingredient to
                     achieve efficient operation.},
  acknowledgement = ack-nhfb,
  articleno       = {3},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC 9(1), February 2013, article 4.
@Article{Ghavami:2013:DAR,
  author          = {Behnam Ghavami and Mohsen Raji and Hossein Pedram and
                     Mehdi B. Tahoori},
  title           = {Design and Analysis of a Robust Carbon Nanotube-Based
                     Asynchronous Primitive Circuit},
  journal         = j-JETC,
  volume          = {9},
  number          = {1},
  pages           = {4:1--4:??},
  month           = feb,
  year            = {2013},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2422094.2422098},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Wed Feb 20 16:42:57 MST 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Carbon Nanotube Field Effect Transistors (CNFETs) show
                     great promise as extensions to silicon CMOS. However,
                     CNFET-based circuits will face great fabrication
                     challenges that will translate into important parameter
                     variations and decreased reliability. Hence,
                     asynchronous logic, which is intrinsically more robust
                     to variability, seems an ideal and perhaps unavoidable
                     choice for digital circuits in CNFET technology. This
                     article presents the results on the design and analysis
                     of a CNFET-based implementation of an asynchronous
                     circuit primitive: the Muller C-element. Using a CNFET
                     SPICE model, we evaluate the robustness of CNFET-based
                     C-element in the presence of CNT fabrication-related
                     nonidealities. We investigate a quantitative evaluation
                     of how timing variability impacts the functionality of
                     a C-element and then, extract the necessary delay
                     constraints of the C-element circuit from the signal
                     transition graph specification. Considering the large
                     degrees of spatial correlation observed between the
                     CNFETs fabricated on directionally grown CNTs, a layout
                     technique is exploited to overcome the robustness
                     challenges of a CNFET-based C-element. Extensive Monte
                     Carlo simulations on the proposed technique have
                     demonstrated the effectiveness of the proposed
                     CNFET-based C-element by improving approximately 50X in
                     its robustness in expense of 65\% area, 47\% delay, and
                     56\% power consumption overheads. Experimental results
                     indicate that implementation of some CNFET-based Quasi
                     Delay Insensitive (QDI) benchmark circuits using the
                     proposed C-element results in significant robustness
                     improvement with negligible power and throughput
                     overheads. As a promising step toward CNFET-based
                     giga-scale integrated circuits, this article shows that
                     the asynchronous logic is an effective approach to
                     design robust integrated circuits in CNFET technology
                     with inherent extreme physical variations.},
  acknowledgement = ack-nhfb,
  articleno       = {4},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC 9(1), February 2013, article 5.
@Article{Chen:2013:SAR,
  author          = {Yung-Chih Chen and Soumya Eachempati and Chun-Yao Wang
                     and Suman Datta and Yuan Xie and Vijaykrishnan
                     Narayanan},
  title           = {A Synthesis Algorithm for Reconfigurable
                     Single-Electron Transistor Arrays},
  journal         = j-JETC,
  volume          = {9},
  number          = {1},
  pages           = {5:1--5:??},
  month           = feb,
  year            = {2013},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2422094.2422099},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Wed Feb 20 16:42:57 MST 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Reducing power consumption has become one of the
                     primary challenges in chip design, and therefore
                     significant efforts are being devoted to find holistic
                     solutions on power reduction from the device level up
                     to the system level. Among a plethora of low power
                     devices that are being explored, single-electron
                     transistors (SETs) at room temperature are particularly
                     attractive. Although prior work has proposed a binary
                     decision diagram-based reconfigurable logic
                     architecture using SETs, it lacks an automatic
                     synthesis algorithm for the architecture. Consequently,
                     in this work, we develop a product-term-based approach
                     that synthesizes a logic circuit by mapping all its
                     product terms into the SET architecture. The
                     experimental results show the effectiveness and
                     efficiency of the proposed approach on a set of MCNC
                     benchmarks.},
  acknowledgement = ack-nhfb,
  articleno       = {5},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC 9(1), February 2013, article 6.
@Article{Tang:2013:TCT,
  author          = {Aoxiang Tang and Niraj K. Jha},
  title           = {Thermal Characterization of Test Techniques for
                     {FinFET} and {$3$D} Integrated Circuits},
  journal         = j-JETC,
  volume          = {9},
  number          = {1},
  pages           = {6:1--6:??},
  month           = feb,
  year            = {2013},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2422094.2422100},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Wed Feb 20 16:42:57 MST 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Power consumption has become a very important
                     consideration during integrated circuit (IC) design and
                     test. During test, it can far exceed the values reached
                     during normal operation and, thus, lead to temperatures
                     above the allowed threshold. Without appropriate
                     temperature reduction, permanent damage may be caused
                     to the IC or invalid test results may be obtained.
                     FinFET is a double-gate field-effect transistor
                     (DG-FET) that was introduced commercially in 2012. Due
                     to the vertical nature of FinFETs and, hence, weaker
                     ability to dissipate heat, this problem is likely to
                     get worse for FinFET circuits. Another technology
                     rapidly gaining popularity is 3D IC integration.
                     Unfortunately, the compact nature of a multidie 3D IC
                     is likely to aggravate the temperature-during-test
                     problem even further. Hence, before temperature-aware
                     test methodologies can be developed, it is important to
                     thermally analyze both FinFET and 3D circuits under
                     test. In this article, we present a methodology for
                     thermal characterization of various test techniques,
                     such as scan and built-in self-test (BIST), for FinFET
                     and 3D ICs. FinFET thermal characterization makes use
                     of a FinFET standard cell library that is characterized
                     with the help of the University of Florida double-gate
                     (UFDG) SPICE model. Thermal profiles for circuits under
                     test are produced by ISAC2 from University of Colorado
                     for FinFET circuits and HotSpot from University of
                     Virginia for 3D ICs. Experimental results indicate that
                     high temperatures result under BIST and much less often
                     under scan, and that both power consumption and test
                     application time should be reduced to lower the
                     temperature of circuits under test, just reducing the
                     power consumption is not enough.},
  acknowledgement = ack-nhfb,
  articleno       = {6},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC 9(1), February 2013, article 7.
@Article{Wang:2013:HRD,
  author          = {Shuo Wang and Jianwei Dai and Lei Wang},
  title           = {Hybrid Redundancy for Defect Tolerance in Molecular
                     Crossbar Memory},
  journal         = j-JETC,
  volume          = {9},
  number          = {1},
  pages           = {7:1--7:??},
  month           = feb,
  year            = {2013},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2422094.2422101},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Wed Feb 20 16:42:57 MST 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Nano/molecular technologies have emerged as the
                     potential fabrics for building future integrated
                     systems. However, due to the imperfect fabrication
                     process, these extremely scaled devices are vulnerable
                     to a large number of defects and transient faults.
                     Memory systems, which are the primary application
                     targeted by these technologies, are particularly
                     exposed to this problem due to the ultra-high
                     integration density and elevated error sensitivity. In
                     this article, we propose a defect-tolerant technique,
                     referred to as hybrid redundancy allocation, for the
                     design of molecular crossbar memory systems. By using
                     soft redundancy (runtime exploitation of memory
                     spatial/temporal locality) in combination with hardware
                     redundancy (spare memory cells), the proposed technique
                     can achieve better error management at a low cost as
                     compared with conventional techniques. Simulation
                     results demonstrate the significant improvement in
                     defect tolerance, efficiency, and scalability of the
                     proposed technique.},
  acknowledgement = ack-nhfb,
  articleno       = {7},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC 9(1), February 2013, article 8.
@Article{Narayanan:2013:VNF,
  author          = {Pritish Narayanan and Michael Leuchtenburg and Jorge
                     Kina and Prachi Joshi and Pavan Panchapakeshan and Chi
                     On Chui and C. Andras Moritz},
  title           = {Variability in Nanoscale Fabrics: Bottom-up Integrated
                     Analysis and Mitigation},
  journal         = j-JETC,
  volume          = {9},
  number          = {1},
  pages           = {8:1--8:??},
  month           = feb,
  year            = {2013},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2422094.2422102},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Wed Feb 20 16:42:57 MST 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Emerging nanodevice-based architectures will be
                     impacted by parameter variation in conjunction with
                     high defect rates. Variations in key physical
                     parameters are caused by manufacturing imprecision as
                     well as fundamental atomic scale randomness. In this
                     article, the impact of parameter variation on nanoscale
                     computing fabrics is extensively studied through a
                     novel integrated methodology across device, circuit and
                     architectural levels. This integrated approach enables
                     to study in detail the impact of physical parameter
                     variation across all fabric layers. A final
                     contribution of the article includes novel techniques
                     to address this impact. The variability framework,
                     while generic, is explored extensively on the Nanoscale
                     Application Specific Integrated Circuits (NASICs)
                     nanowire fabric. For variation of $ \sigma = 10 $ in
                     key physical parameters, the on current is found to
                     vary by up to 3.5X. Circuit-level delay shows up to
                     118\% deviation from nominal. Monte Carlo simulations
                     using an architectural simulator found 67\%
                     nanoprocessor chips to operate below nominal
                     frequencies due to variation. New built-in variation
                     mitigation and fault-tolerance schemes, leveraging
                     redundancy, asymmetric delay paths and biased voting
                     schemes, were developed and evaluated to mitigate these
                     effects. They are shown to improve performance by up to
                     7.5X on a nanoscale processor design with variation,
                     and improve performance in designs relying on
                     redundancy for defect tolerance, without variation
                     assumed. Techniques show up to 3.8X improvement in
                     effective-yield performance products even at a high
                     12\% defect rate. The suite of techniques provides a
                     design space across key system-level metrics such as
                     performance, yield and area.},
  acknowledgement = ack-nhfb,
  articleno       = {8},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Liang:2013:EWB,
  author          = {Jiale Liang and Stanley Yeh and S. Simon Wong and
                     H.-S. Philip Wong},
  title           = {Effect of Wordline\slash Bitline Scaling on the
                     Performance, Energy Consumption, and Reliability of
                     Cross-Point Memory Array},
  journal         = j-JETC,
  volume          = {9},
  number          = {1},
  pages           = {9:1--9:??},
  month           = feb,
  year            = {2013},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2422094.2422103},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Wed Feb 20 16:42:57 MST 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {The impact of wordline/bitline metal wire scaling on the
                     write/read performance, energy consumption, speed, and
                     reliability of the cross-point memory array is
                     quantitatively studied for technology nodes down to
                     single-digit nm. The impending resistivity increase in
                     the Cu wires is found to cause significant decrease of
                     both write and read window margins at the regime when
                     electron surface scattering and grain boundary scattering
                     are substantial. At deeply-scaled device dimensions, the
                     wire energy dissipation and wire latency become
                     comparable to or even exceed the intrinsic values of
                     memory cells. The large current density flowing through
                     the wordlines/bitlines raises additional reliability
                     concerns for the cross-point memory array. All these
                     issues are exacerbated at smaller memory resistance
                     values and larger memory array sizes. They thereby impose
                     strict constraints on the memory device design and
                     preclude the realization of large-scale cross-point
                     memory array with minimum feature sizes beyond the 10 nm
                     node. A rethink in the design methodology of cross-point
                     memory to incorporate and mitigate the scaling effects of
                     wordline/bitline is necessary. Possible solutions include
                     the use of memory wires with better conductivity and
                     scalability, memory arrays with smaller partition sizes,
                     and memory elements with larger resistance values and
                     resistance ratios.},
  acknowledgement = ack-nhfb,
  articleno       = {9},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Paul:2013:ISI,
  author =       "Bipul C. Paul and Arijit Raychowdhury",
  title =        "Introduction to the special issue on memory
                 technologies",
  journal =      j-JETC,
  volume =       "9",
  number =       "2",
  pages =        "10:1--10:??",
  month =        may,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2463585.2463586",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Jun 1 11:19:09 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Yang:2013:MDC,
  author =       "J. Joshua Yang and R. Stanley Williams",
  title =        "Memristive devices in computing system: Promises and
                 challenges",
  journal =      j-JETC,
  volume =       "9",
  number =       "2",
  pages =        "11:1--11:??",
  month =        may,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2463585.2463587",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Jun 1 11:19:09 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Memristive devices with a simple structure are not
                 only very small but also very versatile, which makes
                 them an ideal candidate used for the next generation
                 computing system in the post-Si era. The working
                 mechanism of the devices and a family of nanodevices
                 built based on this working mechanism are introduced
                 first followed by some proposed applications of these
                 novel devices. The promises and challenges of these
                 devices are then discussed, together with the
                 significant progresses made recently in dealing with
                 these challenges.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Jackson:2013:NES,
  author =       "Bryan L. Jackson and Bipin Rajendran and Gregory S.
                 Corrado and Matthew Breitwisch and Geoffrey W. Burr and
                 Roger Cheek and Kailash Gopalakrishnan and Simone Raoux
                 and Charles T. Rettner and Alvaro Padilla and Alex G.
                 Schrott and Rohit S. Shenoy and B{\"u}lent N. Kurdi and
                 Chung H. Lam and Dharmendra S. Modha",
  title =        "Nanoscale electronic synapses using phase change
                 devices",
  journal =      j-JETC,
  volume =       "9",
  number =       "2",
  pages =        "12:1--12:??",
  month =        may,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2463585.2463588",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Jun 1 11:19:09 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "The memory capacity, computational power,
                 communication bandwidth, energy consumption, and
                 physical size of the brain all tend to scale with the
                 number of synapses, which outnumber neurons by a factor
                 of 10,000. Although progress in cortical simulations
                 using modern digital computers has been rapid, the
                 essential disparity between the classical von Neumann
                 computer architecture and the computational fabric of
                 the nervous system makes large-scale simulations
                 expensive, power hungry, and time consuming. Over the
                 last three decades, CMOS-based neuromorphic
                 implementations of ``electronic cortex'' have emerged
                 as an energy efficient alternative for modeling
                 neuronal behavior. However, the key ingredient for
                 electronic implementation of any self-learning
                 system---programmable, plastic Hebbian synapses
                 scalable to biological densities---has remained
                 elusive. We demonstrate the viability of implementing
                 such electronic synapses using nanoscale phase change
                 devices. We introduce novel programming schemes for
                 modulation of device conductance to closely mimic the
                 phenomenon of Spike Timing Dependent Plasticity (STDP)
                 observed biologically, and verify through simulations
                 that such plastic phase change devices should support
                 simple correlative learning in networks of spiking
                 neurons. Our devices, when arranged in a crossbar array
                 architecture, could enable the development of
                 synaptronic systems that approach the density ($
                 \approx 10^{11} $ synapses per sq cm) and energy
                 efficiency (consuming $ \approx 1 $ pJ per synaptic
                 programming event) of the human brain.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Apalkov:2013:STT,
  author =       "Dmytro Apalkov and Alexey Khvalkovskiy and Steven
                 Watts and Vladimir Nikitin and Xueti Tang and Daniel
                 Lottis and Kiseok Moon and Xiao Luo and Eugene Chen and
                 Adrian Ong and Alexander Driskill-Smith and Mohamad
                 Krounbi",
  title =        "Spin-transfer torque magnetic random access memory
                 {(STT-MRAM)}",
  journal =      j-JETC,
  volume =       "9",
  number =       "2",
  pages =        "13:1--13:??",
  month =        may,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2463585.2463589",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Jun 1 11:19:09 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Spin-transfer torque magnetic random access memory
                 (STT-MRAM) is a novel, magnetic memory technology that
                 leverages the base platform established by an existing
                 100+nm node memory product called MRAM to enable a
                 scalable nonvolatile memory solution for advanced
                 process nodes. STT-MRAM features fast read and write
                 times, small cell sizes of 6F$^2$ and potentially even
                 smaller, and compatibility with existing DRAM and SRAM
                 architecture with relatively small associated cost
                 added. STT-MRAM is essentially a magnetic multilayer
                 resistive element cell that is fabricated as an
                 additional metal layer on top of conventional CMOS
                 access transistors. In this review we give an overview
                 of the existing STT-MRAM technologies currently in
                 research and development across the world, as well as
                 some specific discussion of results obtained at Grandis
                 and with our foundry partners. We will show that
                 in-plane STT-MRAM technology, particularly the DMTJ
                 design, is a mature technology that meets all
                 conventional requirements for an STT-MRAM cell to be a
                 nonvolatile solution matching DRAM and/or SRAM drive
                 circuitry. Exciting recent developments in
                 perpendicular STT-MRAM also indicate that this type of
                 STT-MRAM technology may reach maturity faster than
                 expected, allowing even smaller cell size and product
                 introduction at smaller nodes.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Mojumder:2013:DPS,
  author =       "Niladri N. Mojumder and Xuanyao Fong and Charles
                 Augustine and Sumeet K. Gupta and Sri Harsha Choday and
                 Kaushik Roy",
  title =        "Dual pillar spin-transfer torque {MRAMs} for low power
                 applications",
  journal =      j-JETC,
  volume =       "9",
  number =       "2",
  pages =        "14:1--14:??",
  month =        may,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2463585.2463590",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Jun 1 11:19:09 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Electron-spin based data storage for on-chip memories
                 has the potential for ultra-high density, low power
                 consumption, very high endurance, and reasonably low
                 read/write latency. In this article, we discuss the
                 design challenges associated with spin-transfer torque
                 (STT) MRAM in its state-of-the-art configuration. We
                 propose an alternative bit cell configuration and three
                 new genres of magnetic tunnel junction (MTJ) structures
                 to improve STT-MRAM bit cell stabilities, write
                 endurance, and reduce write energy consumption. The
                 proposed multi-port, multi-pillar MTJ structures offer
                 the unique possibility of electrical and spatial
                 isolation of memory read and write. In order to realize
                 ultralow power under process variations, we propose
                 device, bit-cell and architecture level design
                 techniques. Such design alternatives at multiple levels
                 of design abstraction has been found to achieve
                 substantially enhanced robustness, density, reliability
                 and low power as compared to their charge-based
                 counterparts for future embedded applications.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Chatterjee:2013:EAS,
  author =       "Subho Chatterjee and Sayeef Salahuddin and Satish
                 Kumar and Saibal Mukhopadhyay",
  title =        "Electrothermal analysis of spin-transfer-torque random
                 access memory arrays",
  journal =      j-JETC,
  volume =       "9",
  number =       "2",
  pages =        "15:1--15:??",
  month =        may,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2463585.2463591",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Jun 1 11:19:09 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Spin Transfer Torque RAM (STTRAM) is a promising
                 candidate for fast, scalable, high-density, nonvolatile
                 memory in nanometer technology. However, relatively
                 high write current density and small volume of the
                 memory device indicate the possibility of significant
                 self-heating in the STTRAM structure. This article
                 performs a critical analysis of the self-heating
                 induced temperature variations in STTRAM. We perform a
                 3D finite volume method based study to characterize
                 self-heating effect in a single cell. The analysis is
                 extended for STTRAM arrays by developing a
                 computationally efficient RC compact model based
                 thermal analyzer. The analysis shows that self-heating
                 can result in considerable increase in both
                 steady-state value and transient change in temperature
                 of individual cells. The effect is less pronounced at
                 the array level and depends on the activity level, that
                 is, number of active cells within an array size. The
                 analysis further illustrates that self-heating
                 negatively impacts electrical reliability metrics
                 namely, read margin and detection accuracy; degrades
                 cell performance; and modulates energy dissipation.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Chen:2013:CCB,
  author =       "Yiran Chen and Weng-Fai Wong and Hai Li and Cheng-Kok
                 Koh and Yaojun Zhang and Wujie Wen",
  title =        "On-chip caches built on multilevel spin-transfer
                 torque {RAM} cells and its optimizations",
  journal =      j-JETC,
  volume =       "9",
  number =       "2",
  pages =        "16:1--16:??",
  month =        may,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2463585.2463592",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Jun 1 11:19:09 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "It has been predicted that a processor's caches could
                 occupy as much as 90\% of chip area a few technology
                 nodes from the current ones. In this article, we
                 investigate the use of multilevel spin-transfer torque
                 RAM (STT-RAM) cells in the design of processor caches.
                 We start with examining the access (read and write)
                 scheme for multilevel cell (MLC) STT-RAM from a circuit
                 design perspective, detailing the read and write
                 circuits. Compared to traditional SRAM caches, a
                 multilevel cell (MLC) STT-RAM cache design is denser,
                 fast, and requires less energy. However, a number of
                 critical architecture-level issues remain to be solved
                 before MLC STT-RAM technology can be deployed in
                 processor caches. We shall offer solutions to the issue
                 of bit encoding as well as tackle the write endurance
                 problem. In particular, the latter has been neglected
                 in previous works on STT-RAM caches. We propose a set
                 remapping scheme that can potentially prolong the
                 lifetime of a MLC STT-RAM cache by 80$ \times $ on
                 average. Furthermore, a method for recovering the
                 performance that may be lost in some applications due
                 to set remapping is proposed. The impacts of process
                 variations of the MLC STT-RAM cell on the robustness of
                 the memory hierarchy is also discussed, together with
                 various enhancement techniques, namely, ECC and design
                 redundancy.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Thapliyal:2013:DER,
  author =       "Himanshu Thapliyal and Nagarajan Ranganathan",
  title =        "Design of efficient reversible logic-based binary and
                 {BCD} adder circuits",
  journal =      j-JETC,
  volume =       "9",
  number =       "3",
  pages =        "17:1--17:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2491682",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Oct 1 18:20:25 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Reversible logic is gaining significance in the
                 context of emerging technologies such as quantum
                 computing since reversible circuits do not lose
                 information during computation and there is one-to-one
                 mapping between the inputs and outputs. In this work,
                 we present a class of new designs for reversible binary
                 and BCD adder circuits. The proposed designs are
                 primarily optimized for the number of ancilla inputs
                 and the number of garbage outputs and are designed for
                 possible best values for the quantum cost and delay. In
                 reversible circuits, in addition to the primary inputs,
                 some constant input bits are used to realize different
                 logic functions which are referred to as ancilla inputs
                 and are overheads that need to be reduced. Further, the
                 garbage outputs which do not contribute to any useful
                 computations but are needed to maintain reversibility
                 are also overheads that need to be reduced in
                 reversible designs. First, we propose two new designs
                 for the reversible ripple carry adder: (i) one with no
                 input carry $ c_0 $ and no ancilla input bits, and (ii)
                 one with input carry $ c_0 $ and no ancilla input bits.
                 The proposed reversible ripple carry adder designs with
                 no ancilla input bits have less quantum cost and logic
                 depth (delay) compared to their existing counterparts
                 in the literature. In these designs, the quantum cost
                 and delay are reduced by deriving designs based on the
                 reversible Peres gate and the TR gate. Next, four new
                 designs for the reversible BCD adder are presented
                 based on the following two approaches: (i) the addition
                 is performed in binary mode and correction is applied
                 to convert to BCD when required through detection and
                 correction, and (ii) the addition is performed in
                 binary mode and the result is always converted using a
                 binary to BCD converter. The proposed reversible binary
                 and BCD adders can be applied in a wide variety of
                 digital signal processing applications and constitute
                 important design components of reversible computing.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Lee:2013:CIP,
  author =       "Woo Hyung Lee and Pinaki Mazumder",
  title =        "Color image processing with multi-peak resonant
                 tunneling diodes",
  journal =      j-JETC,
  volume =       "9",
  number =       "3",
  pages =        "18:1--18:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2503128",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Oct 1 18:20:25 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "The article introduces a novel approach to color image
                 processing that utilizes multi-peak resonant tunneling
                 diodes for encoding color information in quantized
                 states of the diodes. The Multi-Peak Resonant Tunneling
                 Diodes (MPRTDs) are organized as a two-dimensional
                 array of vertical pillars which are locally connected
                 by programmable passive and active elements with a view
                 to realizing a wide variety of color image processing
                 functions such as quantization, color extraction, image
                 smoothing, edge detection, and line detection. In order
                 to process color information in the input images, two
                 different methods for color representation schemes have
                 been used: one using color mapping and the other using
                 direct RGB representation. Finally, the article uses
                 HSPICE simulation methods for the netlist of the
                 proposed RTD-based nanoarchitecture in order to verify
                 a candidate of image functions by using the
                 afore-mentioned representation methods.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Bobba:2013:CTP,
  author =       "Shashikanth Bobba and Ashutosh Chakraborty and Olivier
                 Thomas and Perrine Batude and Giovanni de Micheli",
  title =        "Cell transformations and physical design techniques
                 for {$3$D} monolithic integrated circuits",
  journal =      j-JETC,
  volume =       "9",
  number =       "3",
  pages =        "19:1--19:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2491675",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Oct 1 18:20:25 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "3D Monolithic Integration (3DMI), also termed as
                 sequential integration, is a potential technology for
                 future gigascale circuits. In 3DMI technology the 3D
                 contacts, connecting different active layers, are in
                 the order of few 100nm. Given the advantage of such
                 small contacts, 3DMI enables fine-grain (gate-level)
                 partitioning of circuits. In this work we present three
                 cell transformation techniques for standard cell-based
                 ICs with 3DMI technology. As a major contribution of
                 this work, we propose a design flow comprising of a
                 cell transformation technique, cell-on-cell stacking,
                 and a physical design technique ({CELONCEL$_{PD}$})
                 aimed at placing cells transformed with cell-on-cell
                 stacking. We analyze and compare various cell
                 transformation techniques for 3DMI technology without
                 disrupting the regularity of the IC design flow. Our
                 experiments demonstrate the effectiveness of CELONCEL
                 design technique, yielding us an area reduction of
                 37.5\%, 16.2\% average reduction in wirelength, and
                 6.2\% average improvement in overall delay, compared
                 with a 2D case when benchmarked across various designs
                 in 45nm technology node.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Tang:2013:DSE,
  author          = {Aoxiang Tang and Niraj K. Jha},
  title           = {Design space exploration of {FinFET} cache},
  journal         = j-JETC,
  volume          = {9},
  number          = {3},
  pages           = {20:1--20:??},
  month           = sep,
  year            = {2013},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2491678},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Tue Oct 1 18:20:25 MDT 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Integration of cache on-chip has significantly improved
                     the performance of modern processors. The relentless
                     demand for ever-increasing performance has led to the
                     need to increase the cache capacity and number of cache
                     levels. However, the performance improvement is
                     accompanied by an increase in chip's power dissipation,
                     requiring the use of more expensive cooling technologies
                     to ensure chip reliability and long product life. The
                     emergence of FinFETs as the technology of choice for
                     high-performance computing poses new challenges to
                     processor designers. With the introduction of new
                     features in FinFETs, for example, independently
                     controllable back gates, researchers have proposed
                     several innovative memory cells that can reduce leakage
                     power significantly, making the integration of a larger
                     cache more practical. In this article, we
                     comprehensively evaluate and compare the performance,
                     power consumption (both dynamic and leakage), area, and
                     temperature of different FinFET SRAM caches by exploring
                     common configurations with varying cache size, block
                     size, associativity, and number of banks. We evaluate
                     caches based on four well-known FinFET SRAM cells:
                     Pass-Gate FeedBack (PGFB), Row-based Back-Gate Biasing
                     (RBGB), 8T, and 4T. We show how the caches can be
                     simulated at self-consistent temperatures (at which
                     leakage and temperature are in equilibrium). Drowsy and
                     decay caches are two well-known leakage reduction
                     techniques. We implement them in the context of FinFET
                     caches to investigate their impact. We show that the
                     RBGB cell-based cache is far superior in leakage and
                     Power-Delay Product (PDP) to those based on the other
                     three cells, sometimes by an order of magnitude. This
                     superiority is maintained even when drowsy or decay
                     leakage reduction techniques are applied to caches based
                     on the other three cells, but not to the one based on
                     the RBGB cell. This significantly diminishes the
                     importance of drowsy or decay cache techniques, at least
                     when the RBGB cell is used.},
  acknowledgement = ack-nhfb,
  articleno       = {20},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Zamani:2013:IFV,
  author          = {Masoud Zamani and Hanieh Mirzaei and Mehdi B. Tahoori},
  title           = {{ILP} formulations for variation\slash defect-tolerant
                     logic mapping on crossbar nano-architectures},
  journal         = j-JETC,
  volume          = {9},
  number          = {3},
  pages           = {21:1--21:??},
  month           = sep,
  year            = {2013},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2491680},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Tue Oct 1 18:20:25 MDT 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Several emerging nano-technologies, including crossbar
                     nano-architectures, have recently been studied as
                     possible replacement or supplement to CMOS technology in
                     the future. However, extreme process variation and high
                     failure rates, mainly due to atomic device sizes, are
                     major challenges for crossbar nano-architectures. This
                     article presents variation- and defect-tolerant logic
                     mapping on crossbar nano-architectures. Since
                     variation/defect-aware mapping is an NP-hard problem, we
                     introduce a set of Integer Linear Programming (ILP)
                     formulations to effectively solve the problem in a
                     reasonable time. The proposed ILP formulations can be
                     used for both diode-based and FET-based crossbars.
                     Experimental results on benchmark circuits show that our
                     approach can reduce the critical-path delay 39\% compared
                     to the Simulated Annealing (SA) method. It can also
                     successfully achieve 97\% defect-free mapping with 40\%
                     defect density. It can tolerate process variations to
                     meet timing constraints in 95\% of the cases, compared to
                     only 77\% achieved by SA.},
  acknowledgement = ack-nhfb,
  articleno       = {21},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Sun:2013:EVC,
  author          = {Guangyu Sun and Eren Kursun and Jude A. Rivers and
                     Yuan Xie},
  title           = {Exploring the vulnerability of {CMPs} to soft errors
                     with {$3$D} stacked nonvolatile memory},
  journal         = j-JETC,
  volume          = {9},
  number          = {3},
  pages           = {22:1--22:??},
  month           = sep,
  year            = {2013},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2491679},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Tue Oct 1 18:20:25 MDT 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Improving the vulnerability to soft errors is one of
                     the important design goals for future architecture
                     design of Chip-MultiProcessors (CMPs). In this study,
                     we explore the soft error characteristics of CMPs with
                     3D stacked NonVolatile Memory (NVM), in particular, the
                     Spin-Transfer Torque Random Access Memory (STT-RAM),
                     whose cells are immune to radiation-induced soft errors
                     and do not have endurance problems. We use 3D stacking
                     as an enabler for modular integration of STT-RAM
                     memories with minimum disruption in the baseline
                     processor design flow, while providing further
                     interconnection and capacity advantages. We take an
                     in-depth look at alternative replacement schemes to
                     explore the soft error resilience benefits and design
                     trade-offs of 3D stacked STT-RAM and capture the
                     multivariable optimization challenges microprocessor
                     architectures face. We propose a vulnerability metric,
                     with respect to the instruction and data in the core
                     pipeline and through the cache hierarchy, to present a
                     comprehensive system evaluation with respect to
                     reliability, performance, and power consumption for our
                     CMP architectures. Our experimental results show that,
                     for the average workload, replacing memories with an
                     STT-RAM alternative significantly mitigates soft errors
                     on-chip, improves the performance by 14.15\%, and
                     reduces power consumption by 13.44\%.},
  acknowledgement = ack-nhfb,
  articleno       = {22},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Yang:2013:NAC,
  author          = {Shengqi Yang and Wenping Wang and Mark Hagan and Wei
                     Zhang and Pallav Gupta and Yu Cao},
  title           = {{NBTI}-aware circuit node criticality computation},
  journal         = j-JETC,
  volume          = {9},
  number          = {3},
  pages           = {23:1--23:??},
  month           = sep,
  year            = {2013},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2491681},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Tue Oct 1 18:20:25 MDT 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {For sub-65nm technology nodes, Negative Bias
                     Temperature Instability (NBTI) has become a primary
                     limiting factor of circuit lifetime. During the past
                     few years, researchers have spent considerable effort
                     on accurate modeling and characterization of circuit
                     delay degradation caused by NBTI at different design
                     levels. The search for techniques and methodologies
                     which can aid in effectively minimizing the NBTI effect
                     on circuit delay is still underway. In this work, we
                     present the usage of node criticality computation to
                     drive NBTI-aware timing analysis and optimization.
                     Circuits that have undergone this optimization flow
                     show strong resistance to NBTI delay degradation. For
                     the first time, this work proposes a node criticality
                     computation algorithm under an NBTI-aware timing
                     analysis and optimization framework. Our work provides
                     answers to the following yet unaddressed questions: (a)
                     what is the definition of node criticality in a circuit
                     under the NBTI effect? (b) how do we identify the
                     critical nodes that, once protected, will be immune to
                     NBTI timing degradation? and (c) what are the NBTI
                     effect attenuation approaches? Experimental results
                     indicate that by protecting the critical nodes found by
                     our proposed methodology, circuit delay degradation can
                     be reduced by up to 50\%. Combined with peak
                     temperature reduction, the delay degradation can be
                     further improved.},
  acknowledgement = ack-nhfb,
  articleno       = {23},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Wettin:2013:CNE,
  author =       "Paul Wettin and Anuroop Vidapalapati and Amlan Ganguly
                 and Partha Pratim Pande",
  title =        "Complex network-enabled robust wireless
                 network-on-chip architectures",
  journal =      j-JETC,
  volume =       "9",
  number =       "3",
  pages =        "24:1--24:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2491676",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Oct 1 18:20:25 MDT 2013",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "The Network-on-Chip (NoC) paradigm has emerged as a
                 scalable interconnection infrastructure for modern
                 multicore chips. However, with growing levels of
                 integration, the traditional NoCs suffer from high
                 latency and energy dissipation in on-chip data transfer
                 due to conventional multihop metal/dielectric-based
                 interconnects. Three-dimensional integration, on-chip
                 photonics, RF, and wireless links have been proposed as
                 radical low-power and low-latency alternatives to the
                 conventional planar wire-based designs. Wireless NoCs
                 with Carbon NanoTube (CNT) antennas are shown to
                 outperform traditional wire-based NoCs significantly in
                 achievable data rate and energy dissipation. However,
                 such emerging and transformative technologies will be
                 prone to high levels of failures due to various issues
                 related to manufacturing challenges and integration. On
                 the other hand, several naturally occurring complex
                 networks such as colonies of microbes and the World
                 Wide Web are known to be inherently robust against high
                 rates of failures and harsh environments. This article
                 advocates adoption of such complex network-based
                 architectures to minimize the effect of wireless link
                 failures on the performance of the NoC. Through
                 cycle-accurate simulations it is shown that the
                 wireless NoC architectures inspired by natural complex
                 networks perform better than their conventional wired
                 counterparts even in the presence of high degrees of
                 link failures. We demonstrate the robustness of the
                 proposed wireless NoC architecture by incorporating
                 both uniform and application-specific traffic
                 patterns.",
  acknowledgement = ack-nhfb,
  articleno =    "24",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Zhang:2013:DTU,
  author          = {Xuehui Zhang and Andrew Ferraiuolo and Mohammad
                     Tehranipoor},
  title           = {Detection of {Trojans} using a combined ring
                     oscillator network and off-chip transient power
                     analysis},
  journal         = j-JETC,
  volume          = {9},
  number          = {3},
  pages           = {25:1--25:??},
  month           = sep,
  year            = {2013},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2491677},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Tue Oct 1 18:20:25 MDT 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Verifying the trustworthiness of Integrated Circuits
                     (ICs) is of utmost importance, as hardware Trojans may
                     destroy ICs bound for critical applications. A novel
                     methodology combining on-chip structure with external
                     current measurements is proposed to verify whether or
                     not an IC is Trojan free. This method considers
                     Trojans' impact on neighboring cells and on the entire
                     IC's power consumption, and effectively localizes the
                     measurement of dynamic power. To achieve this, we
                     develop a new on-chip ring oscillator network structure
                     distributed across the entire chip and place each ring
                     oscillator's components in different rows of a
                     standard-cell design. By developing novel statistical
                     data analysis, the effect of process variations on the
                     ICs' transient power will be separated from the effect
                     of Trojans. Simulation results using 90nm technology
                     and experimental results on Xilinx Spartan-6 FPGAs
                     demonstrate the efficiency of our proposed method.},
  acknowledgement = ack-nhfb,
  articleno       = {25},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Guiducci:2013:ISI,
  author          = {Carlotta Guiducci},
  title           = {Introduction to Special Issue on Bioinformatics},
  journal         = j-JETC,
  volume          = {9},
  number          = {4},
  pages           = {26:1--26:??},
  month           = nov,
  year            = {2013},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2536744.2536745},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Wed Nov 27 17:50:48 MST 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  articleno       = {26},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Piovesan:2013:ERP,
  author          = {Damiano Piovesan and Giuseppe Profiti and Pier Luigi
                     Martelli and Piero Fariselli and Rita Casadio},
  title           = {Extended and Robust Protein Sequence Annotation over
                     Conservative Nonhierarchical Clusters: The Case Study
                     of the {ABC} Transporters},
  journal         = j-JETC,
  volume          = {9},
  number          = {4},
  pages           = {27:1--27:??},
  month           = nov,
  year            = {2013},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2504729},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Wed Nov 27 17:50:48 MST 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Genome annotation is one of the most important issues
                     in the genomic era. The exponential growth rate of
                     newly sequenced genomes and proteomes urges the
                     development of fast and reliable annotation methods,
                     suited to exploit all the information available in
                     curated databases of protein sequences and structures.
                     To this aim we developed BAR+, the Bologna Annotation
                     Resource. The basic notion is that sequences with high
                     identity value to a counterpart can inherit the same
                     function/s and structure, if available. As a case study
                     we describe how the ATP-binding domain of the ABC
                     transporters can be found and modeled in over 30,000
                     new sequences not annotated before. We also mapped into
                     BAR+ all the ABC transporters listed in the Transporter
                     Classification DataBase and found that within our
                     environment annotation could be extended to another
                     256,866 sequences.},
  acknowledgement = ack-nhfb,
  articleno       = {27},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Abate:2013:ILH,
  author          = {Francesco Abate and Andrea Acquaviva and Elisa Ficarra
                     and Enrico Macii},
  title           = {Integration of Literature with Heterogeneous
                     Information for Genes Correlation Scoring},
  journal         = j-JETC,
  volume          = {9},
  number          = {4},
  pages           = {28:1--28:??},
  month           = nov,
  year            = {2013},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2504728},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Wed Nov 27 17:50:48 MST 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Determining the correlation between biomedical terms
                     is a powerful instrument to help scientist research
                     activity, both to understand experimental results and
                     to design new ones. In particular, a great potential
                     comes from the integration of the many heterogeneous
                     information sources currently available on the Web. In
                     this article we focus on the correlation between genes
                     and biological processes. In this context, we present a
                     methodology for integrating information from biomedical
                     literature with other heterogeneous types of structured
                     information. In particular, the information sources
                     integrated in this work are PubMed abstracts, pathway
                     databases, and NCI thesaurus definitions. The
                     integration is performed at the semantic analysis level
                     using a customized approach we developed to modulate
                     the impact of the different sources on the correlation
                     score. We report the results of a study concerning the
                     impact of the information integration on the
                     correlation score and of the user-level parameters we
                     introduced to modulate the impact of pathway data or
                     NCI definitions with respect to biomedical literature
                     information, depending on the context of the search. To
                     evaluate the methodology, we performed correlation
                     measures on six biological processes and nine genes by
                     comparing the results with and without the integration
                     of pathways and NCI definitions.},
  acknowledgement = ack-nhfb,
  articleno       = {28},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Graziano:2013:HVB,
  author          = {Mariagrazia Graziano and Stefano Frache and Maurizio
                     Zamboni},
  title           = {A Hardware Viewpoint on Biosequence Analysis: What's
                     Next?},
  journal         = j-JETC,
  volume          = {9},
  number          = {4},
  pages           = {29:1--29:??},
  month           = nov,
  year            = {2013},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2504774},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Wed Nov 27 17:50:48 MST 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Biosequence alignment recently received an increasing
                     support from both commodity and dedicated hardware
                     platforms. Processing capabilities are constantly
                     rising, but still not satisfying the limitless
                     requirements of this application. We give an insight on
                     the contribution to this need that can possibly be
                     expected from emerging technology devices and
                     architectures, focusing as an example on nanofabrics
                     based on silicon nanowires. By varying a few parameters
                     we explore the solution space, and demonstrate with
                     proper figures of merit how this family of beyond CMOS
                     structures could be considered as the effective
                     disruptive technology for biosequence analysis
                     applications.},
  acknowledgement = ack-nhfb,
  articleno       = {29},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Venken:2013:SBM,
  author          = {Lyn Venken and Kathleen Marchal and Jos Vanderleyden},
  title           = {Synthetic Biology and Microdevices: a Powerful
                     Combination},
  journal         = j-JETC,
  volume          = {9},
  number          = {4},
  pages           = {30:1--30:??},
  month           = nov,
  year            = {2013},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2504775},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Wed Nov 27 17:50:48 MST 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Recent developments demonstrate that the combination
                     of microbiology with micro- and nanoelectronics is a
                     successful approach to develop new miniaturized sensing
                     devices and other technologies. In the last decade,
                     there has been a shift from the optimization of the
                     abiotic components, for example, the chip, to the
                     improvement of the processing capabilities of cells
                     through genetic engineering. The synthetic biology
                     approach will not only give rise to systems with new
                     functionalities, but will also improve the robustness
                     and speed of their response towards applied signals. To
                     this end, the development of new genetic circuits has
                     to be guided by computational design methods that
                     enable to tune and optimize the circuit response. As
                     the successful design of genetic circuits is highly
                     dependent on the quality and reliability of its
                     composing elements, intense characterization of
                     standard biological parts will be crucial for an
                     efficient rational design process in the development of
                     new genetic circuits. Microengineered devices can
                     thereby offer a new analytical approach for the study
                     of complex biological parts and systems. By summarizing
                     the recent techniques in creating new synthetic
                     circuits and in integrating biology with microdevices,
                     this review aims at emphasizing the power of combining
                     synthetic biology with microfluidics and
                     microelectronics.},
  acknowledgement = ack-nhfb,
  articleno       = {30},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Editors:2014:ISI,
  author          = {Editors},
  title           = {Introduction to special issue on reliability and
                     device degradation in emerging technologies},
  journal         = j-JETC,
  volume          = {10},
  number          = {1},
  pages           = {1:1--1:??},
  month           = jan,
  year            = {2014},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2543749.2543750},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Tue Jan 14 19:15:04 MST 2014},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  articleno       = {1},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Kufluoglu:2014:RMN,
  author          = {Haldun K{\"u}fl{\"u}oglu and Cathy Chancellor and Min
                     Chen and Claude Cirba and Vijay Reddy},
  title           = {Recovery modeling of negative bias temperature
                     instability {(NBTI)} for {SPICE}-compatible circuit
                     aging simulators},
  journal         = j-JETC,
  volume          = {10},
  number          = {1},
  pages           = {2:1--2:??},
  month           = jan,
  year            = {2014},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2517648},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Tue Jan 14 19:15:04 MST 2014},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {A feasible computational framework that enables
                     improved predictability of NBTI degradation within
                     commercially available tools is discussed. The NBTI
                     model is used for real-time circuit operation where
                     recovery is present. The complementary nature of
                     implementation is readily incorporated into existing
                     model extraction and verification tools. The method
                     provides significantly enhanced accuracy in simulations
                     when compared to circuit data, yet retains practicality
                     and flexibility.},
  acknowledgement = ack-nhfb,
  articleno       = {2},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Arasu:2014:RIL,
  author          = {Senthil Arasu and Mehrdad Nourani and Vijay Reddy and
                     John M. {Carulli Jr.} and Gautam Kapila and Min Chen},
  title           = {Reliability improvement of logic and clock paths in
                     power-efficient designs},
  journal         = j-JETC,
  volume          = {10},
  number          = {1},
  pages           = {3:1--3:??},
  month           = jan,
  year            = {2014},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2543749.2543751},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Tue Jan 14 19:15:04 MST 2014},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Performance degradation due to transistor aging is a
                     significant impediment to high-performance IC design
                     due to increasing concerns of reliability mechanisms
                     such as negative-bias-temperature-instability (NBTI).
                     The concern only grows with technology scaling as the
                     effects of positive bias temperature instability (PBTI)
                     is becoming prominent in future technologies and
                     compounding with the effects of NBTI. Although aging of
                     transistor is inevitable and the magnitude of
                     degradation due to aging varies depending upon the
                     context. Specifically, in power-efficient systems
                     designs, the logic and clock paths are susceptible to
                     static stress resulting in peak degradation due to BTI
                     occurrence when clock is gated. In this article, we
                     present the reliability impact of making systems power
                     efficient and propose a design-for-reliability
                     methodology that can be used in conjunction with
                     low-power design techniques to alleviate the stress
                     conditions caused by rendering circuits in idle state.
                     The technique- BTI-Refresh, is shown to be applicable
                     to both logic and clock paths alike and focuses on
                     preventing prolonged static stress using periodic
                     refreshes to achieve alternating stress. The mechanism
                     is shown to integrate seamlessly into the design at
                     gate-level without requiring any architectural or
                     RT-level changes. Using ISCAS benchmarks and
                     Kogge-Stone-Adder circuits, it is shown to reduce the
                     aging effect in logic path delay due to static stress
                     by up to 50\% with negligible area and power overhead.
                     BTI-Refresh is extended to clock-paths to prevent
                     pulse-width degradation due to static aging and with
                     minimal clock-skew.},
  acknowledgement = ack-nhfb,
  articleno       = {3},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Sun:2014:WAC,
  author          = {Jin Sun and Roman Lysecky and Karthik Shankar and
                     Avinash Kodi and Ahmed Louri and Janet Roveda},
  title           = {Workload assignment considering {NBTI} degradation in
                     multicore systems},
  journal         = j-JETC,
  volume          = {10},
  number          = {1},
  pages           = {4:1--4:??},
  month           = jan,
  year            = {2014},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2539124},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Tue Jan 14 19:15:04 MST 2014},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {With continuously shrinking technology, reliability
                     issues such as Negative Bias Temperature Instability
                     (NBTI) has resulted in considerable degradation of
                     device performance, and eventually the short
                     mean-time-to-failure (MTTF) of the whole multicore
                     system. This article proposes a new workload balancing
                     scheme based on device-level fractional NBTI model to
                     balance the workload among active cores while relaxing
                     stressed ones. Starting with NBTI-induced threshold
                     voltage degradation, we define a concept of Capacity
                     Rate (CR) as an indication of one core's ability to
                     accept workload. Capacity rate captures core's
                     performance variability in terms of delay and power
                     metrics under the impact of NBTI aging. The proposed
                     workload balancing framework employs the capacity rates
                     as workload constraints, applies a Dynamic Zoning (DZ)
                     algorithm to group cores into zones to process task
                     flows, and then uses Dynamic Task Scheduling (DTS) to
                     allocate tasks in each zone with balanced workload and
                     minimum communication cost. Experimental results on a
                     64-core system show that by allowing a small part of
                     the cores to relax over a short time period, the
                     proposed methodology improves multicore system yield
                     (percentage of core failures) by 20\%, while extending
                     MTTF by 30\% with insignificant degradation in
                     performance (less than 3\%).},
  acknowledgement = ack-nhfb,
  articleno       = {4},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Chabi:2014:RLA,
  author          = {Djaafar Chabi and Damien Querlioz and Weisheng Zhao
                     and Jacques-Olivier Klein},
  title           = {Robust learning approach for neuro-inspired nanoscale
                     crossbar architecture},
  journal         = j-JETC,
  volume          = {10},
  number          = {1},
  pages           = {5:1--5:??},
  month           = jan,
  year            = {2014},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2539123},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Tue Jan 14 19:15:04 MST 2014},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Scaling beyond CMOS require a new combination of
                     computing paradigm and new devices. In this context,
                     memristor are often considered as best candidate to
                     implement efficiently synapses in hardware neural
                     networks. In this article, we analyze the impact of
                     memristor parameter variability. We build an analytical
                     model of the global reliability at the crossbar level.
                     It is based on a supervised learning method with
                     multilayer and redundancy extensions. Comparisons with
                     Monte Carlo simulations of small neural network
                     validate our analytical model. It can be used to
                     extrapolate directly the reliability of large-scale
                     neural system. Our extrapolations show that high defect
                     rate and important parameter variability can be handle
                     efficiency with a moderate amount of redundancy.},
  acknowledgement = ack-nhfb,
  articleno       = {5},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC volume 10, number 1 (January 2014), article 6.
@Article{Frache:2014:NAM,
  author =       "Stefano Frache and Mariagrazia Graziano and Maurizio
                 Zamboni",
  title =        "Nanoarray architectures multilevel simulation",
  journal =      j-JETC,
  volume =       "10",
  number =       "1",
  pages =        "6:1--6:??",
  month =        jan,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541882",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jan 14 19:15:04 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Density and regularity are deemed as the major
                 advantages of nanoarray architectures based on
                 nanowires. Literature demonstrated that proper
                 reliability analyses must be performed and solutions
                 have to be devised to improve nanoarrays yield. Their
                 complexity and high-fault probability claim for
                 specific design automation tools able to explore
                 circuit solutions, performance and fault-tolerant
                 approaches. We envision a simulator conceived to carry
                 on characterizations in terms of logic behavior,
                 defect-induced output error rate assessment, switching
                 activity, power and timing performance. Though already
                 existing for traditional technology, a simulator based
                 on specific technological and topological tiled
                 nanoarray descriptions, and conceived to join both
                 device and architecture levels, has never been
                 attempted at the degree of accuracy we present. Our
                 contribution is twofold. First, marking a difference
                 with respect to the state of the art, we developed an
                 algorithm based on an event-driven engine which works
                 at switch level and is not simply built on top of cost
                 functions evaluations. The straightforward advantage is
                 the possibility to follow the evolution of dynamic
                 control sequences throughout all the inner components
                 of the nanoarray, and, as a consequence, to obtain
                 circuit level characterization as a projection of the
                 real internal parameters. Second, we added to our
                 simulator the capability to inject faults with specific
                 statistical distributions associated to the nanoarray
                 topology. Here we extract output error rates and yield
                 for one of the possible nanoarray structures proposed
                 in literature, the NASIC. Results specificity and
                 accuracy demonstrate the simulator trustworthiness, its
                 effectiveness for extensive nanoarrays characterization
                 and its suitability as a foundation for both higher
                 architectural and lower device simulation levels. The
                 aim of this work, then, is to provide insights into the
                 intertwined relation between actual technology and
                 circuit design for these emerging fabrics, and, as a
                 consequence, to clarify how defects and variability
                 affect circuits and systems performance.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@article{Avritzer:2014:ISI,
  author          = {Alberto Avritzer and Tadashi Dohi},
  title           = {Introduction to special issue on {WoSAR 2011}},
  journal         = j-JETC,
  volume          = {10},
  number          = {1},
  pages           = {7:1--7:??},
  month           = jan,
  year            = {2014},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2543749.2543752},
  issn            = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l          = {1550-4832},
  bibdate         = {Tue Jan 14 19:15:04 MST 2014},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  articleno       = {7},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@article{Cotroneo:2014:SSA,
  author          = {Domenico Cotroneo and Roberto Natella and Roberto
                     Pietrantuono and Stefano Russo},
  title           = {A survey of software aging and rejuvenation studies},
  journal         = j-JETC,
  volume          = {10},
  number          = {1},
  pages           = {8:1--8:??},
  month           = jan,
  year            = {2014},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2539117},
  issn            = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l          = {1550-4832},
  bibdate         = {Tue Jan 14 19:15:04 MST 2014},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Software aging is a phenomenon plaguing many
                     long-running complex software systems, which exhibit
                     performance degradation or an increasing failure rate.
                     Several strategies based on the proactive rejuvenation
                     of the software state have been proposed to counteract
                     software aging and prevent failures. This survey
                     article provides an overview of studies on Software
                     Aging and Rejuvenation (SAR) that have appeared in
                     major journals and conference proceedings, with respect
                     to the statistical approaches that have been used to
                     forecast software aging phenomena and to plan
                     rejuvenation, the kind of systems and aging effects
                     that have been studied, and the techniques that have
                     been proposed to rejuvenate complex software systems.
                     The analysis is useful to identify key results from SAR
                     research, and it is leveraged in this article to
                     highlight trends and open issues.},
  acknowledgement = ack-nhfb,
  articleno       = {8},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@article{Zhao:2014:SRS,
  author          = {Jing Zhao and Yuliang Jin and Kishor S. Trivedi and
                     Rivalino {Matias Jr.} and Yanbin Wang},
  title           = {Software rejuvenation scheduling using accelerated
                     life testing},
  journal         = j-JETC,
  volume          = {10},
  number          = {1},
  pages           = {9:1--9:??},
  month           = jan,
  year            = {2014},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2539118},
  issn            = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l          = {1550-4832},
  bibdate         = {Tue Jan 14 19:15:04 MST 2014},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {A number of studies have reported the phenomenon of
                     ``Software aging'', caused by resource exhaustion and
                     characterized by progressive software performance
                     degradation. In this article, we carry out an
                     experimental study of software aging and rejuvenation
                     for an on-line bookstore application, following the
                     standard configuration of TPC-W benchmark. While real
                     website is used for the bookstore, the clients are
                     emulated. In order to reduce the time to application
                     failures caused by memory leaks, we use the accelerated
                     life testing (ALT) approach. We then select the Weibull
                     time to failure distribution at normal level, to be
                     used in a semi-Markov process, to compute the optimal
                     software rejuvenation trigger interval. Since the
                     validation of optimal rejuvenation trigger interval
                     with emulated browsers will take an inordinate long
                     time, we develop a simulation model to validate the ALT
                     experimental results, and also estimate the
                     steady-state availability to cross-validate the results
                     of the semi-Markov availability model.},
  acknowledgement = ack-nhfb,
  articleno       = {9},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

%%% JETC volume 10, number 1 (January 2014), article 10.
@Article{Machida:2014:JCT,
  author =       "Fumio Machida and Victor F. Nicola and Kishor S.
                 Trivedi",
  title =        "Job completion time on a virtualized server with
                 software rejuvenation",
  journal =      j-JETC,
  volume =       "10",
  number =       "1",
  pages =        "10:1--10:??",
  month =        jan,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2539121",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jan 14 19:15:04 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "This article analyzes the completion time of a job
                 running on a virtualized server subject to software
                 aging and rejuvenation in a virtual machine monitor
                 (VMM). A job running on the server may be interrupted
                 by virtual machine (VM) failure, VMM failure or VMM
                 rejuvenation. The job interruption is categorized as
                 either preemptive-repeat (prt), in which case the
                 interrupted job needs to restart from the beginning, or
                 preemptive-resume (prs), in which case the job
                 resumes execution from the point of interruption. Using
                 a semi-Markov process (SMP) to model the server
                 behavior, the steady-state server availability is
                 computed and the theory developed in Kulkarni et al.
                 [1987] is used to obtain the Laplace--Stieltjes
                 transform (LST) of the job completion time. In the
                 numerical experiments, we introduce four types of aging
                 behavior of VMM. The effectiveness of VMM rejuvenation
                 on job completion time is discussed in association with
                 the type of interruption it causes and the VMM aging
                 type. With our parameter settings, VMM rejuvenation
                 with prs job interruption improves the performance of
                 job execution regardless of the aging type, with
                 performance degradation is taken into account.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@article{Araujo:2014:SAE,
  author          = {Jean Araujo and Rubens Matos and Vandi Alves and Paulo
                     Maciel and F. Vieira de Souza and Rivalino {Matias Jr.}
                     and Kishor S. Trivedi},
  title           = {Software aging in the {Eucalyptus} cloud computing
                     infrastructure: Characterization and rejuvenation},
  journal         = j-JETC,
  volume          = {10},
  number          = {1},
  pages           = {11:1--11:??},
  month           = jan,
  year            = {2014},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2539122},
  issn            = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l          = {1550-4832},
  bibdate         = {Tue Jan 14 19:15:04 MST 2014},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {The need for high reliability, availability and
                     performance has significantly increased in modern
                     applications, that handle rapidly growing demands while
                     providing uninterruptible services. Cloud computing
                     systems fundamentally provide access to large pools of
                     data and computational resources. Eucalyptus is a
                     software framework largely used to implement private
                     clouds and hybrid-style Infrastructure as a Service. It
                     implements the Amazon Web Service (AWS) API, allowing
                     interoperability with other AWS-based services. This
                     article investigates the software aging effects in the
                     Eucalyptus framework, considering workloads composed of
                     intensive requests for remote storage attachment and
                     virtual machine instantiations. We found problems that
                     may be harmful to system dependability and performance,
                     specifically regarding to RAM memory and swap space
                     exhaustion, besides highly excessive CPU utilization by
                     the virtual machines. We also present an approach that
                     applies time series analysis to schedule rejuvenation,
                     so as to reduce the downtime by predicting the proper
                     moment to perform the rejuvenation. We experimentally
                     evaluate our approach using an Eucalyptus test bed. The
                     results show that our approach achieves higher
                     availability, when compared to a threshold-triggered
                     rejuvenation method based on continuous monitoring of
                     resources utilization.},
  acknowledgement = ack-nhfb,
  articleno       = {11},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@article{Chen:2014:CRP,
  author          = {Jifeng Chen and Shuo Wang and Mohammad Tehranipoor},
  title           = {Critical-reliability path identification and delay
                     analysis},
  journal         = j-JETC,
  volume          = {10},
  number          = {2},
  pages           = {12:1--12:??},
  month           = feb,
  year            = {2014},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2564926},
  issn            = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l          = {1550-4832},
  bibdate         = {Fri Feb 28 17:06:25 MST 2014},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Circuit reliability analysis at the presilicon stage
                     has become vital for sub-45nm technology designs in
                     particular, due to aging effects, such as Negative Bias
                     Temperature Instability (NBTI) and Hot Carrier
                     Injection (HCI). To avoid potential reliability hazards
                     in the postsilicon stage, current large-scale designs
                     for commercial implementation overpessimistically
                     analyze circuit aging under assumed worst-case workload
                     in order not to violate the corner cases even for low
                     possibilities, thus introducing unnecessary margin in
                     the design timing analysis. The major issue is lack of
                     an effective aging analysis method applicable to large
                     designs with low CPU runtime, which is mainly due to:
                     (1) conventional reliability tools are extremely
                     time-consuming for circuit-level timing analysis and
                     thus are not practical for large designs; (2)
                     mathematical models developed to expedite the process
                     are not accurate due to the high complexity of aging
                     effects. In this article, a comprehensive analysis is
                     presented to highlight the importance of each aging
                     parameter. Then, a novel methodology is developed based
                     on current commercial reliability tools to guarantee
                     its high accuracy on circuit-level aging analysis.
                     Existing proven low-level mathematical models are
                     further enhanced to extensively speed up a higher level
                     analysis by taking advantage of the explicit
                     intermediate conditions stored in a pregenerated lookup
                     table. Our results indicate $ \geq 244 \times $
                     improved computational efficiency, $ \leq 5 \% $
                     relative error, and $ \leq 0.7 \% $ absolute error
                     compared with commercial reliability analysis tools
                     (e.g., HSPICE MOSRA).},
  acknowledgement = ack-nhfb,
  articleno       = {12},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@article{Gladshtein:2014:DBP,
  author          = {Michael Gladshtein},
  title           = {Delay-based processing-in-wire for design of {QCA}
                     serial decimal arithmetic units},
  journal         = j-JETC,
  volume          = {10},
  number          = {2},
  pages           = {13:1--13:??},
  month           = feb,
  year            = {2014},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2564927},
  issn            = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l          = {1550-4832},
  bibdate         = {Fri Feb 28 17:06:25 MST 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/jetc/;
                     https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                     https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Quantum-dot cellular automata (QCA) technology is now
                     considered to be one of the prospective technologies
                     for a nanocomputer creation. The physical properties of
                     QCA and its expanding range of computer applications
                     make it expedient to use the novel paradigm of
                     nanocomputer architecture: serial decimal
                     storage-transfer-processing. The delay-based encoding
                     of decimal digits allows the use a delay element as a
                     main element of QCA serial arithmetic units. The simple
                     implementation of the delay element by a short length
                     of QCA wire results in reduction of complexity and of
                     the area required for a QCA circuit. The theoretical
                     basics of delay-based processing-in-wire and design
                     examples of QCA serial decimal arithmetic units are
                     presented.},
  acknowledgement = ack-nhfb,
  articleno       = {13},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@article{Lin:2014:RRM,
  author          = {Chia-Chun Lin and Niraj K. Jha},
  title           = {{RMDDS}: {Reed--Muller} decision diagram synthesis of
                     reversible logic circuits},
  journal         = j-JETC,
  volume          = {10},
  number          = {2},
  pages           = {14:1--14:??},
  month           = feb,
  year            = {2014},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2564923},
  issn            = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l          = {1550-4832},
  bibdate         = {Fri Feb 28 17:06:25 MST 2014},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {In this article, we propose a flexible and efficient
                     reversible logic synthesizer. It exploits the
                     complementary advantages of two methods: Reed--Muller
                     Reversible Logic Synthesis (RMRLS) and Decision Diagram
                     Synthesis (DDS), and is thus called Reed--Muller
                     Decision Diagram Synthesis (RMDDS). RMRLS does not
                     scale to a large number of qubits (i.e., quantum bits).
                     DDS tools, even though efficient, add a large number of
                     ancillary qubits and typically incur much higher
                     quantum cost than necessary. RMDDS overcomes these
                     obstacles. It is flexible in the sense that users can
                     either optimize the number of qubits or the quantum
                     cost in the circuit implementation. It is also
                     efficient because the circuits can be synthesized
                     within user-defined CPU times. This combination of
                     flexibility and efficiency has been missing from
                     synthesizers presented earlier. When used to synthesize
                     reversible functions, RMDDS reduces the number of
                     qubits by up to 79.2\% (average of 54.6\%) when the
                     synthesis objective is to minimize the number of qubits
                     and the quantum cost by up to 71.5\% (average of
                     35.7\%) when the synthesis objective is to minimize
                     quantum cost, relative to DDS methods. For irreversible
                     functions (which are automatically embedded in
                     reversible functions), the corresponding best (average)
                     reductions in the number of qubits is 42.1\% (22.5\%)
                     when minimizing the number of qubits, and in quantum
                     cost, it is 63.0\% (25.9\%) when minimizing quantum
                     cost.},
  acknowledgement = ack-nhfb,
  articleno       = {14},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@article{Liu:2014:CSN,
  author          = {Weichen Liu and Xuan Wang and Jiang Xu and Wei Zhang
                     and Yaoyao Ye and Xiaowen Wu and Mahdi Nikdast and
                     Zhehui Wang},
  title           = {On-chip sensor networks for soft-error tolerant
                     real-time multiprocessor systems-on-chip},
  journal         = j-JETC,
  volume          = {10},
  number          = {2},
  pages           = {15:1--15:??},
  month           = feb,
  year            = {2014},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2564928},
  issn            = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l          = {1550-4832},
  bibdate         = {Fri Feb 28 17:06:25 MST 2014},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {As transistor density continues to increase with the
                     advent of nanotechnology, reliability issues raised by
                     the more frequent appearance of soft errors are
                     becoming critical for future embedded multiprocessor
                     systems design. State-of-the-art techniques for soft
                     error protections targeting multiprocessor systems
                     result either high chip cost and area overhead or high
                     performance degradation and energy consumption, and do
                     not fulfill the increasing requirements for high
                     performance and dependability. In this article we
                     present a systematic approach, that is, the Sensor
                     Networks-on-Chip (SENoC), to collaboratively and
                     efficiently manage on-chip applications and overcome
                     reliability threats to Multiprocessor Systems-on-Chip
                     (MPSoC). A hardware-software collaborative approach is
                     proposed to solve soft error problems: a hardware-based
                     on-chip sensor network is built for soft error
                     detection, and a software-based recovery mechanism is
                     applied for soft error correction. A two-step
                     scheduling scheme is presented for reliable application
                     and chip management, combining an off-line static
                     optimization stage for application performance
                     maximization and an online lightweight dynamic
                     adjustment stage to handle runtime variations and
                     exceptions. This strategy introduces only trivial
                     overhead on hardware design and much lower overhead on
                     software control and execution, and hence performance
                     degradation and energy consumption is greatly reduced.
                     We build a cycle-accurate simulator using SystemC, and
                     verify the effectiveness of our technique by comparing
                     performance with related techniques on several
                     real-world applications.},
  acknowledgement = ack-nhfb,
  articleno       = {15},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@article{Kim:2014:ICU,
  author          = {Jaeyoon Kim and Sandip Tiwari},
  title           = {Inexact computing using probabilistic circuits: Ultra
                     low-power digital processing},
  journal         = j-JETC,
  volume          = {10},
  number          = {2},
  pages           = {16:1--16:??},
  month           = feb,
  year            = {2014},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2564925},
  issn            = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l          = {1550-4832},
  bibdate         = {Fri Feb 28 17:06:25 MST 2014},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Numerous computing applications can tolerate low error
                     rates. In such applications, inexact approaches provide
                     the ability to achieve significantly lower power. This
                     work demonstrates the power-error trade-offs that can
                     be achieved. Using probabilistic modeling in sub-50-nm
                     silicon transistor technology, the relationship between
                     statistical uncertainties and errors are elucidated for
                     different configurations and topologies and the
                     trade-offs quantified. Gate-level implementation of the
                     probabilistic CMOS logic is validated by circuit
                     simulations of a commercial 45-nm SOI CMOS process
                     technology. Using a practical ALU architecture where
                     voltages can be scaled from most significant to least
                     significant bit blocks as an example, the potential
                     benefits of this technique are shown. A calculation
                     error of $ 10^{-6} $, an error rate quite tolerable for
                     many computational tasks, is shown to be possible with
                     a total power reduction of more than 40\%.},
  acknowledgement = ack-nhfb,
  articleno       = {16},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@article{Pierce:2014:NTN,
  author          = {Luke Pierce and Spyros Tragoudas},
  title           = {Nanopipelined threshold network synthesis},
  journal         = j-JETC,
  volume          = {10},
  number          = {2},
  pages           = {17:1--17:??},
  month           = feb,
  year            = {2014},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2564924},
  issn            = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l          = {1550-4832},
  bibdate         = {Fri Feb 28 17:06:25 MST 2014},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Threshold logic gates allow for complex multiinput
                     functions to be implemented using a single gate thereby
                     reducing the power and area of a circuit. Clocked
                     threshold gates are nanopipelined to increase network
                     throughput. It is shown that synthesis methods that do
                     not consider the synchronization of the nanopipeline
                     can produce an enormous amount of buffers. The proposed
                     algorithm synthesizes a Boolean network into a
                     nanopipelined threshold logic network by minimizing not
                     only the number of combinational clusters but also the
                     associated buffer insertion overhead.},
  acknowledgement = ack-nhfb,
  articleno       = {17},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@article{Xiang:2014:TDT,
  author          = {Dong Xiang and Kele Shen},
  title           = {A thermal-driven test application scheme for pre-bond
                     and post-bond scan testing of three-dimensional {ICs}},
  journal         = j-JETC,
  volume          = {10},
  number          = {2},
  pages           = {18:1--18:??},
  month           = feb,
  year            = {2014},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2564922},
  issn            = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l          = {1550-4832},
  bibdate         = {Fri Feb 28 17:06:25 MST 2014},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {The three-dimensional (3-D) technology offers a new
                     solution to the increasing density of integrated
                     circuits (ICs). In this work, we propose novel scan
                     architectures for 3-D IC pre-bond and post-bond testing
                     by considering the interconnection overhead of
                     through-silicon-vias (TSVs). Since hotspots in 3-D ICs
                     often cause performance and reliability issues, we also
                     develop different test ordering schemes for prebond and
                     postbond testing to avoid applying test vectors that
                     could worsen the temperature distribution. Experimental
                     results show that the peak temperature can be lowered
                     by 20\% with the 3-D scan tree architecture. When
                     combined with the test ordering scheme, the 3-D scan
                     tree can further reduce peak temperature by over
                     30\%.},
  acknowledgement = ack-nhfb,
  articleno       = {18},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@article{Kamal:2014:IPV,
  author          = {Mehdi Kamal and Ali Afzali-Kusha and Saeed Safari and
                     Massoud Pedram},
  title           = {Impact of Process Variations on Speedup and Maximum
                     Achievable Frequency of Extensible Processors},
  journal         = j-JETC,
  volume          = {10},
  number          = {3},
  pages           = {19:1--19:??},
  month           = apr,
  year            = {2014},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2567665},
  issn            = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l          = {1550-4832},
  bibdate         = {Mon May 5 14:50:39 MDT 2014},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {In this article, we investigate the impact of process
                     variations on the speedup and maximum frequency of the
                     extended ISA processor. First, without considering
                     process variations, a custom functional unit (CFU) is
                     designed based on nominal timing parameters, then the
                     timing variations of critical paths of the extensible
                     processor, including the baseline processor and the
                     CFU, are investigated by considering both systematic
                     and random variations. Next, the maximum frequency of
                     the extensible processor and the speed enhancement
                     factor of the extended ISA for different benchmarks are
                     investigated. Results show that timing variation could
                     reduce the speedup of the extensible processor.
                     However, this reduction is highly dependent on the
                     baseline processor and the CFU structures.
                     Additionally, the impact of process variations in the
                     worst-case design approach is studied. Results show
                     that the speedup of the extensible processor is reduced
                     more than in the case when custom instructions (CIs)
                     are selected without considering process variations. To
                     study the impact of each variation type, speedup
                     variations due to random and systematic variations are
                     investigated separately. The study reveals that random
                     variation has a similar effect on the CFU and the
                     baseline processor, while the impact of systematic
                     variation on the baseline processor is greater than the
                     CFU.},
  acknowledgement = ack-nhfb,
  articleno       = {19},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Chung:2014:DET,
  author = {Haera Chung and Christof Teuscher and Partha Pande},
  title = {Design and Evaluation of Technology-Agnostic Heterogeneous
    Networks-on-Chip},
  journal = j-JETC,
  volume = {10},
  number = {3},
  pages = {20:1--20:??},
  month = apr,
  year = {2014},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2567666},
  ISSN = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L = {1550-4832},
  bibdate = {Mon May 5 14:50:39 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract = {Traditional metal-wire-based networks-on-chip (NoC) suffer
    from high latency and power dissipation as the system size scales up
    in the number of cores. This limitation stems from the inherent
    multihop communication nature of larger NoCs. It has previously been
    shown that the performance of NoCs can be significantly improved by
    introducing long-range, low power, and high-bandwidth single-hop
    links between distant cores. While previous work has focused on
    specific NoC architectures and configurations, it remains an open
    question whether heterogeneous link types are beneficial in a broad
    range of NoC architectures. In this article, we show that a generic
    NoC architecture with heterogeneous link types allows for NoCs with
    higher bandwidth at a lower cost compared to homogeneous networks.
    We further show that such NoCs scale up significantly better in
    terms of performance and cost. We demonstrate these
    broadly-applicable results by using a technology-agnostic complex
    network approach that targets NoC architectures with various
    emerging link types.},
  acknowledgement = ack-nhfb,
  articleno = {20},
  fjournal = {ACM Journal on Emerging Technologies in Computing Systems
    (JETC)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Palaniswamy:2014:ITL,
  author = {Ashok Kumar Palaniswamy and Spyros Tragoudas},
  title = {Improved Threshold Logic Synthesis Using Implicant-Implicit
    Algorithms},
  journal = j-JETC,
  volume = {10},
  number = {3},
  pages = {21:1--21:??},
  month = apr,
  year = {2014},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2597175},
  ISSN = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L = {1550-4832},
  bibdate = {Mon May 5 14:50:39 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract = {Existing threshold logic synthesis methods decompose larger
    input functions into smaller input functions and perform synthesis
    for them. It is shown that significantly larger input functions can
    be synthesized by implementing the existing methods in an
    implicant-implicit manner. Experimental results on the ISCAS 85
    benchmarks show that this impacts the synthesis cost, which drops
    significantly. More specifically, as the size of the functions that
    can be handled by the synthesis algorithm increases, the number of
    threshold logic gates required to implement very large input
    functions decreases. In addition, the total weight decreases and the
    performance is improved.},
  acknowledgement = ack-nhfb,
  articleno = {21},
  fjournal = {ACM Journal on Emerging Technologies in Computing Systems
    (JETC)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Chen:2014:CTS,
  author = {Fu-Wei Chen and Tingting Hwang},
  title = {Clock-Tree Synthesis with Methodology of Reuse in {$3$D-IC}},
  journal = j-JETC,
  volume = {10},
  number = {3},
  pages = {22:1--22:??},
  month = apr,
  year = {2014},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2567668},
  ISSN = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L = {1550-4832},
  bibdate = {Mon May 5 14:50:39 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract = {IP reuse methodology has been used extensively in SoC
    (system-on-chip) design. In this reuse methodology, while design and
    implementation costs are saved, manufacturing cost is not. To
    further reduce the cost, this reuse concept has been proposed at
    mask and die level in three-dimensional integrated circuits (3D-IC).
    In order to achieve manufacturing reuse, in this article, we propose
    a new methodology for designing a global clock tree in 3D-IC. The
    objective is to extend an existing clock tree in 2D IC to 3D IC,
    taking into consideration the wirelength, clock skew, and the number
    of TSVs. Compared with NNG- and 3D-MMM-based methods, our proposed
    method reduces the wirelength of the new die and the skew of the
    global 3D clock tree on average, 5.85\% and 2.3\%, and 76.92\% and
    48.7\%, respectively. In more than two die design, the average
    improvements of the wirelength and clock skew of our method as
    compared with the 3D-MMM-based method are 4.23\% and 46.84\%,
    respectively.},
  acknowledgement = ack-nhfb,
  articleno = {22},
  fjournal = {ACM Journal on Emerging Technologies in Computing Systems
    (JETC)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Liu:2014:CHP,
  author = {Wulong Liu and Yu Wang and Yuchun Ma and Yuan Xie and Huazhong
    Yang},
  title = {On-Chip Hybrid Power Supply System for Wireless Sensor Nodes},
  journal = j-JETC,
  volume = {10},
  number = {3},
  pages = {23:1--23:??},
  month = apr,
  year = {2014},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2492683},
  ISSN = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L = {1550-4832},
  bibdate = {Mon May 5 14:50:39 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract = {With the miniaturization of electronic devices, small-size
    but high-capacity power supply systems appear to be more and more
    important. A hybrid power source, which consists of a fuel cell (FC)
    and a rechargeable battery, has the advantages of long lifetime and
    good load-following capabilities. In this article, we propose the
    schematic of a hybrid power supply system that can be integrated on
    a chip compatible with present CMOS processes. For the on-chip,
    fuel-cell-based hybrid power system in wireless sensor node design,
    we propose a two steps optimization: (1) dynamic power management
    (DPM), and (2) adaptive fuel cell optimal power point tracking
    (AOPPT). Simulation results demonstrate that the on-chip FC-Bat
    hybrid power system can be used for wireless sensor nodes under
    different usage scenarios. Our proposed DPM method can achieve
    12.9\% more energy savings than the method without DPM. Meanwhile,
    implementing our AOPPT approach can save about 17\% energy compared
    with the fixed architecture for the fuel cell system. For an on-chip
    power system with 1cm$^2$ area consumption, the wafer-level battery
    can power a typical sensor node for only about five months, while
    our on-chip hybrid power system will supply the same sensor node for
    two years steadily.},
  acknowledgement = ack-nhfb,
  articleno = {23},
  fjournal = {ACM Journal on Emerging Technologies in Computing Systems
    (JETC)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Grissom:2014:IAC,
  author = {Daniel Grissom and Christopher Curtis and Philip Brisk},
  title = {Interpreting Assays with Control Flow on Digital Microfluidic
    Biochips},
  journal = j-JETC,
  volume = {10},
  number = {3},
  pages = {24:1--24:??},
  month = apr,
  year = {2014},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2567669},
  ISSN = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L = {1550-4832},
  bibdate = {Mon May 5 14:50:39 MDT 2014},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract = {BioCoder is a C++ library developed at Microsoft Research,
    India, for the unambiguous specification of biochemical assays. This
    article describes language extensions to BioCoder along with a
    compiler and runtime system that translate and execute assays
    specified using BioCoder on a software simulator. The simulator
    mimics the behavior of laboratories-on-a-chip (LoCs) based on a
    droplet actuation technology called electrowetting on dielectric
    (EWoD). To date, prior compilers targeting similar EWoD devices are
    limited to assays specified as directed acyclic graphs (DAGs) and
    cannot handle arbitrary control flow or feedback from the LoC. The
    framework presented herein addresses these challenges through
    dynamic interpretation, thereby enlarging the space of assays that
    can be compiled onto EWoD devices.},
  acknowledgement = ack-nhfb,
  articleno = {24},
  fjournal = {ACM Journal on Emerging Technologies in Computing Systems
    (JETC)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Yuan:2014:FEA,
  author =       "Bo Yuan and Bin Li",
  title =        "A Fast Extraction Algorithm for Defect-Free
                 Subcrossbar in Nanoelectronic Crossbar",
  journal =      j-JETC,
  volume =       "10",
  number =       "3",
  pages =        "25:1--25:??",
  month =        apr,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2517137",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Mon May 5 14:50:39 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Due to the super scale, high defect density, and
                 per-chip designing paradigm of emerging
                 nanoelectronics, the runtime of the algorithms for
                 defect-tolerant design is of vital importance from the
                 perspective of practicability. In this article, an
                 efficient and effective heuristic defect-free
                 subcrossbar extraction algorithm is proposed which
                 improves performance by mixing the heuristics from two
                 state-of-the-art algorithms and then is speeded up
                 significantly by considerably reducing the number of
                 major loops. Compared with the current most effective
                 algorithm that improves the solution quality (i.e.,
                 size of the defect-free subcrossbar obtained) at the
                 cost of high time complexity $O(n^3)$, the time
                 complexity of the proposed heuristic algorithm is
                 proved to be $O(n^2)$. Using a large set of
                 instances of various scales and defect densities, the
                 simulation results show that the proposed algorithm can
                 offer similar high-quality solutions as the current
                 most effective algorithm while consuming much shorter
                 runtimes (reduced to about 1/3 to 1/5) than the current
                 most effective algorithm.",
  acknowledgement = ack-nhfb,
  articleno =    "25",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Chaudhuri:2014:VDS,
  author =       "Sourindra M. Chaudhuri and Niraj K. Jha",
  title =        "{$3$D} vs. {$2$D} Device Simulation of {FinFET} Logic
                 Gates under {PVT} Variations",
  journal =      j-JETC,
  volume =       "10",
  number =       "3",
  pages =        "26:1--26:??",
  month =        apr,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2567670",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Mon May 5 14:50:39 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Recently, multigate transistors have been gaining
                 attention as an alternative to conventional MOSFETs.
                 Superior gate control over the channel, smaller
                 subthreshold leakage, and reduced susceptibility to
                 process variations are some of the key features that
                 give multigate structures a competitive edge over
                 MOSFETs. Among various multigate structures,
                 silicon-on-insulator (SOI) FinFETs are promising, owing
                 to their ease of fabrication. However, characterization
                 of SOI FinFET devices/gates needs immediate attention
                 in order for them to gain greater popularity in this
                 decade. Ideally, 3D device simulation should be done
                 for accurate circuit analysis. However, this is
                 impractical due to the huge CPU time required. As a
                 possible alternative, simulating a 2D cross section of
                 the device yields $10\times$ to $100\times$ reduction
                 in CPU time. However, this introduces significant
                 error in the range of 7\% to 20\% when evaluating the
                 on/off current ($I_{ON}/I_{OFF}$) for a single device
                 and leakage current or propagation delay
                 ($I_{LEAK}/t_D$) for logic gates. In this work, we
                 first present a methodology to obtain optimized 3D
                 device simulation models for SOI FinFETs. Based on
                 these 3D models, we develop adjusted 2D models to
                 capture 3D simulation accuracy with 2D simulation
                 efficiency. We report results for the 22nm SOI FinFET
                 technology node. We adjust gate underlap ($L_{UN}$) in
                 the 2D cross section of the n/pFinFET devices in order
                 to mimic 3D device behavior. When the adjusted 2D
                 models are employed in mixed-mode simulation of FinFET
                 logic gates, the error in the evaluation of
                 $I_{LEAK}/t_D$ is very small. To the best of our
                 knowledge, this is the first such attempt. We show
                 that 2D device models remain valid even under process,
                 voltage, and temperature (PVT) variations. We target
                 process variations in gate length ($L_G$), fin
                 thickness ($T_{SI}$), gate oxide thickness
                 ($T_{OX}$), and gate workfunction ($\Phi_G$), which
                 are the parameters that have been shown to have the
                 most impact on leakage and delay.",
  acknowledgement = ack-nhfb,
  articleno =    "26",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Lin:2014:POF,
  author =       "Jiun-Li Lin and Po-Hsun Wu and Tsung-Yi Ho",
  title =        "Placement Optimization of Flexible {TFT} Circuits with
                 Mechanical Strain and Temperature Consideration",
  journal =      j-JETC,
  volume =       "11",
  number =       "1",
  pages =        "1:1--1:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629497",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Mon Oct 6 16:15:58 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Mobility is the primary device parameter affecting
                 circuit performance in flexible thin-film transistor
                 (TFT) technologies, and is particularly sensitive to
                 the change of mechanical strain and temperature.
                 However, existing algorithms only consider the impact
                 of mechanical strain in cell placement of flexible TFT
                 circuits. Without taking temperature into
                 consideration, mobility may be dramatically decreased
                 which leads to circuit performance degradation. This
                 article presents the first work to minimize the
                 mobility variation caused by the change of both
                 mechanical strain and temperature. Experimental results
                 show that the proposed algorithms can effectively and
                 efficiently reduce the increasing critical path
                 delay.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Roy:2014:TAG,
  author =       "Sudip Roy and Bhargab B. Bhattacharya and Sarmishtha
                 Ghoshal and Krishnendu Chakrabarty",
  title =        "Theory and Analysis of Generalized Mixing and Dilution
                 of Biochemical Fluids Using Digital Microfluidic
                 Biochips",
  journal =      j-JETC,
  volume =       "11",
  number =       "1",
  pages =        "2:1--2:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629578",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Mon Oct 6 16:15:58 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Digital microfluidic (DMF) biochips are recently being
                 advocated for fast on-chip implementation of
                 biochemical laboratory assays or protocols, and several
                 algorithms for diluting and mixing of reagents have
                 been reported. However, all methods for such automatic
                 sample preparation suffer from a drawback that they
                 assume the availability of input fluids in pure form,
                 that is, each with an extreme concentration factor
                 (CF) of 100\%. In many real-life scenarios, the stock
                 solutions consist of samples/reagents with multiple
                 CFs. No algorithm is yet known for preparing a target
                 mixture of fluids with a given ratio when its
                 constituents are supplied with random concentrations.
                 An intriguing question is whether or not a given target
                 ratio is feasible to produce from such a general input
                 condition. In this article, we first study the
                 feasibility properties for the generalized mixing
                 problem under the (1:1) mix-split model with an
                 allowable error in the target CFs not exceeding
                 $1/2^d$, where the integer $d$ is user specified and
                 denotes the desired accuracy level of CF. Next, an
                 algorithm is proposed which produces the desired
                 target ratio of $N$ reagents in $O(Nd)$ mix-split
                 steps, where $N$ ($\geq 3$) denotes the number of
                 constituent fluids in the mixture. The feasibility
                 analysis also leads to the characterization of the
                 total space of input stock solutions from which a
                 given target mixture can be derived, and conversely,
                 the space of all target ratios, which are derivable
                 from a given set of input reagents with arbitrary CFs.
                 Finally, we present a generalized algorithm for
                 diluting a sample S in minimum (1:1) mix-split steps
                 when two or more arbitrary concentrations of S
                 (diluted with the same buffer) are supplied as inputs.
                 These results settle several open questions in
                 droplet-based algorithmic microfluidics and offer
                 efficient solutions for a wider class of on-chip
                 sample preparation problems.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Chen:2014:ULL,
  author =       "Xianmin Chen and Niraj K. Jha",
  title =        "Ultra-Low-Leakage Chip Multiprocessor Design with
                 Hybrid {FinFET} Logic Styles",
  journal =      j-JETC,
  volume =       "11",
  number =       "1",
  pages =        "3:1--3:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629576",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Mon Oct 6 16:15:58 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "FinFET has begun replacing CMOS at the 22nm technology
                 node because of its enhanced ability to mitigate
                 short-channel effects. Although leakage power of FinFET
                 logic gates is lower than their CMOS counterparts, it
                 still contributes to a large part of total power
                 consumption. In this article, we show how
                 ultra-low-leakage FinFET chip multiprocessors (CMPs)
                 can be designed using a hybrid logic style. This hybrid
                 style exploits the ultra-low-leakage feature of
                 asymmetric-workfunction shorted-gate (ASG) FinFETs and
                 the high-performance feature of shorted-gate (SG)
                 FinFETs. We explore the impact of the hybrid style at
                 both the module and CMP levels. To do this, we have
                 developed FinFET logic libraries targeted at SG and ASG
                 logic gates, suitably characterized for various
                 parameters of interest. We have also modified existing
                 tools and created a framework to evaluate the hybrid
                 designs of SRAMs, caches, and CMPs. Using the design
                 with SG FinFETs as the baseline for comparison, our
                 experimental results show that the hybrid style can
                 reduce leakage power of execution units to as low as
                 10.6\% of the baseline without hurting performance,
                 that of SRAMs to between 21.5\% and 4.8\% of the
                 baseline with 0\%--8.3\% delay overhead, and that of
                 CMPs to 10.0\% of the baseline with negligible
                 performance degradation.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Lin:2014:NTL,
  author =       "Ing-Chao Lin and Shun-Ming Syu and Tsung-Yi Ho",
  title =        "{NBTI} Tolerance and Leakage Reduction Using Gate
                 Sizing",
  journal =      j-JETC,
  volume =       "11",
  number =       "1",
  pages =        "4:1--4:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629657",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Mon Oct 6 16:15:58 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Leakage power is a major design constraint in deep
                 submicron technology and below. Meanwhile, transistor
                 degradation due to Negative Bias Temperature
                 Instability (NBTI) has emerged as one of the main
                 reliability concerns in nanoscale technology. Gate
                 sizing is a widely used technique to reduce circuit
                 leakage, and this approach has recently attracted much
                 attention with regard to improving circuits to tolerate
                 NBTI. However, these studies only consider timing and
                 area constraints, and many other important issues, such
                 as slew and max-load, are missing. In this article, we
                 present an efficient gate sizing framework that can
                 reduce leakage and improve circuit reliability under
                 timing constraints. Our algorithms consider slack, slew
                 and max-load constraints. The benchmarks are those from
                 ISPD 2012, which feature industrial design properties,
                 including discrete cell sizes, nonconvex cell timing
                 models, slew dependencies and constraints, as well as
                 large design sizes. The experimental results obtained
                 from ISPD 2012 benchmark circuits demonstrate that our
                 approach can meet all the constraints and tolerated
                 NBTI degradation with a power savings of 6.54\% as
                 compared with the traditional method.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Xie:2014:TCP,
  author =       "Jing Xie and Yang Du and Yuan Xie",
  title =        "Testable Cross-Power Domain Interface {(CPDI)} Circuit
                 Design in Monolithic {$3$D} Technology",
  journal =      j-JETC,
  volume =       "11",
  number =       "1",
  pages =        "5:1--5:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629516",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Mon Oct 6 16:15:58 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Optimizing energy consumption for electronic systems
                 has been an important design consideration. Multipower
                 domain design is widely used for low-power and
                 high-performance applications. Data transfer between
                 power domains needs a cross-power domain interface
                 (CPDI). The existing level-conversion flip-flop (LCFF)
                 structures all need dual power rails, which lead to
                 large area and performance overhead. In this article,
                 we propose a scanable CPDI circuit, utilizing
                 monolithic 3D technology. This interface functions as a
                 flip-flop and provides reliable data conversion from
                 one power domain to another. It has a built-in scan
                 feature, which makes it a testable design. Our design
                 separates power rails in each tier, substantially
                 reducing physical design complexity and area penalty.
                 The design is implemented in a 20nm, 28nm, and 45nm
                 low-power technology. It shows a 20\%--35\% smaller
                 insertion delay compared to normal designs. This
                 proposed design also shows scalability and better
                 energy consumption than previous LCFF circuits.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Kumawat:2014:PMA,
  author =       "Renu Kumawat and Vineet Sahula and Manoj S. Gaur",
  title =        "Probabilistic Modeling and Analysis of Molecular
                 Memory",
  journal =      j-JETC,
  volume =       "11",
  number =       "1",
  pages =        "6:1--6:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629533",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Mon Oct 6 16:15:58 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "This article investigates the aspects of designing a
                 nanocell based molecular memory. An empirical model for
                 molecular device is developed, based on circuit
                 behavior of nitro-substituted Oligo (Phenylene
                 Ethynylene) molecule (OPE). This device model is
                 subsequently used to design nanocell based 1-bit memory
                 and verified using HSPICE. The approach is extended to
                 train the nanocell for multibit storage capability
                 using external voltage signals. It is observed that to
                 successfully train a 2-bit molecular memory, the number
                 of control signals should be approx. one-fourth of
                 total number of nanoparticles. A computational
                 framework is proposed to compute the probability of
                 retrieving the stored data bits correctly, at the
                 output terminal of the nanocell buffer. This nanocell
                 configuration is simulated by systematically varying
                 number of nanoparticles and molecular switches. It is
                 observed that the probability of the existence of at
                 least one path from input to output approaches close to
                 unity with presence of 20 or more nanoparticles in a
                 nanocell. During memory model validation, 1000 samples
                 of 1-bit memory (consisting of 20 nanoparticles) were
                 generated and verified for read and write operations.
                 The model verification results obtained for this memory
                 cell closely match those obtained using analytical
                 solution of probabilistic graph model.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Lin:2014:QQM,
  author =       "Chia-Chun Lin and Amlan Chakrabarti and Niraj K. Jha",
  title =        "{QLib}: Quantum module library",
  journal =      j-JETC,
  volume =       "11",
  number =       "1",
  pages =        "7:1--7:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629430",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Mon Oct 6 16:15:58 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Quantum algorithms are known for their ability to
                 solve some problems much faster than classical
                 algorithms. They are executed on quantum circuits,
                 which consist of a cascade of quantum gates. However,
                 synthesis of quantum circuits is not straightforward
                 because of the complexity of quantum algorithms.
                 Generally, quantum algorithms contain two parts:
                 classical and quantum. Thus, synthesizing circuits for
                 the two parts separately reduces overall synthesis
                 complexity. In addition, many quantum algorithms use
                 similar subroutines that can be implemented with
                 similar circuit modules. Because of their frequent use,
                 it is important to use automated scripts to generate
                 such modules efficiently. These modules can then be
                 subjected to further synthesis optimizations. This
                 article proposes QLib, a quantum module library, which
                 contains scripts to generate quantum modules of
                 different sizes and specifications for well-known
                 quantum algorithms. Thus, QLib can also serve as a
                 suite of benchmarks for quantum logic and physical
                 synthesis.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Wille:2014:ISI,
  author =       "Robert Wille and Rolf Drechsler and Mehdi B. Tahoori",
  title =        "Introduction to the {Special Issue on Reversible
                 Computation}",
  journal =      j-JETC,
  volume =       "11",
  number =       "2",
  pages =        "8:1--8:??",
  month =        nov,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2663349",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Nov 5 18:01:28 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{DeVos:2014:MCC,
  author =       "Alexis {De Vos} and Stijn {De Baerdemacker}",
  title =        "Matrix Calculus for Classical and Quantum Circuits",
  journal =      j-JETC,
  volume =       "11",
  number =       "2",
  pages =        "9:1--9:??",
  month =        nov,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2669370",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Nov 5 18:01:28 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Quantum computation on $w$ qubits is represented by
                 the infinite unitary group $ {\rm U}(2^w) $; classical
                 reversible computation on $w$ bits is represented by
                 the finite symmetric group $ {\rm S}_{2^w}$. In order to
                 establish the relationship between classical reversible
                 computing and quantum computing, we introduce two Lie
                 subgroups $ {\rm XU}(n)$ and $ {\rm ZU}(n)$ of the
                 unitary group $ {\rm U}(n)$. The former consists of all
                 unitary $ n \times n$ matrices with all line sums equal
                 to $1$; the latter consists of all unitary diagonal $ n
                 \times n$ matrices with first entry equal to $1$. Such
                 a group structure also reveals the relationship between
                 matrix calculus and diagrammatic $ z x$-calculus of
                 quantum circuits.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Hanninen:2014:QII,
  author =       "Ismo K. H{\"a}nninen and Craig S. Lent and Gregory L.
                 Snider",
  title =        "Quantifying Irreversible Information Loss in Digital
                 Circuits",
  journal =      j-JETC,
  volume =       "11",
  number =       "2",
  pages =        "10:1--10:??",
  month =        nov,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629523",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Nov 5 18:01:28 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Heat generation limits the performance of
                 state-of-the-art integrated circuits, originating from
                 the wasteful static CMOS operating principle. Near-term
                 solutions like adiabatic charging for energy recovery
                 and limiting friction-type heat sources provide
                 considerable improvement. However, these methods do not
                 address the ultimate thermodynamic necessity to expel
                 energy related to information loss in the computing
                 process. In emerging beyond-CMOS technologies, this bit
                 erasure heat alone can overwhelm the cooling capacity
                 and set the limits of the computing performance.
                 Therefore, logical information loss is becoming an
                 important factor for digital circuit design, and tools
                 have to be developed for analysis and optimization.
                 This article presents a framework for estimating the
                 amount of information loss in complex logic circuits,
                 demonstrating the method by modeling the irreversible
                 bit erasures in a standard binary adder structure.
                 Binary addition is one of the most often used and
                 highly optimized digital designs, and we estimate the
                 erasure bounds for components on various levels of
                 design abstraction, showing that the actual logic gate
                 implementations have orders of magnitude higher loss
                 than the addition operation itself would require. The
                 method and the results can be used to optimize circuits
                 for a higher degree of logical reversibility and energy
                 conservation.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{DeVos:2014:DGF,
  author =       "Alexis {De Vos} and St{\'e}phane Burignat and Robert
                 Gl{\"u}ck and Torben {\AE}gidius Mogensen and Holger
                 Bock Axelsen and Michael Kirkedal Thomsen and Eva
                 Rotenberg and Tetsuo Yokoyama",
  title =        "Designing Garbage-Free Reversible Implementations of
                 the Integer Cosine Transform",
  journal =      j-JETC,
  volume =       "11",
  number =       "2",
  pages =        "11:1--11:??",
  month =        nov,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629532",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Nov 5 18:01:28 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Discrete linear transformations are important tools in
                 information processing. Many such transforms are
                 injective and therefore prime candidates for a
                 physically reversible implementation into hardware. We
                 present here reversible integer cosine transformations
                 on $n$ input integers. The resulting reversible circuit
                 is able to perform both the forward transform and the
                 inverse transform. The detailed structure of such a
                 reversible design strongly depends on the odd prime
                 factors of the determinant of the transform: whether
                 those are of the form $ 2^k \pm 1 $ or of the form $
                 2^k \pm 2^l \pm 1 $ or neither of these forms.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Mogensen:2014:GFR,
  author =       "Torben {\AE}gidius Mogensen",
  title =        "Garbage-Free Reversible Multipliers for Arbitrary
                 Constants",
  journal =      j-JETC,
  volume =       "11",
  number =       "2",
  pages =        "12:1--12:??",
  month =        nov,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629515",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Nov 5 18:01:28 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "We present a method based on Mealy machines for
                 constructing reversible circuitry for multiplying
                 integers by arbitrary integer constants. The circuits
                 generate no garbage and use no ancillae. The circuits
                 are quite compact for small constants and are, in the
                 worst case, bounded by $ O(n^2) $ multi-control Toffoli
                 gates per bit-slice, where $n$ is the number of bits in
                 the constant. These gates will have $ O(n)$ inputs, so
                 the total number of pass-transistors needed to
                 implement the circuit is $ O(n^3) $ transistors per bit
                 slice, and the quantum cost (which is exponential in
                 the number of inputs to a Toffoli gate) is $ O(2^n)$.
                 For some interesting cases, the cost can be reduced to
                 $ O(n)$ gates per bit-slice, reducing the cost to $
                 O(n^2)$ transistors per bit slice. The quantum cost is
                 still $ O(2^n)$, as the remaining gates have $ O(n)$
                 inputs. We also look at an alternative construction
                 that, at the cost of adding $ O(n)$ ancillae, reduces
                 the cost for arbitrary constants to $ O(n)$ gates, $
                 O(n^2)$ transistors, though still with $ O(2^n)$
                 quantum cost.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Nguyen:2014:RED,
  author =       "Trung Duc Nguyen and Rodney {Van Meter}",
  title =        "A Resource-Efficient Design for a Reversible Floating
                 Point Adder in Quantum Computing",
  journal =      j-JETC,
  volume =       "11",
  number =       "2",
  pages =        "13:1--13:??",
  month =        nov,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629525",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Nov 5 18:01:28 MST 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Reversible logic has applications in low-power
                 computing and quantum computing. However, there are few
                 existing designs for reversible floating-point adders
                 and none suitable for quantum computation. In this
                 article, we propose a resource-efficient reversible
                 floating-point adder, suitable for binary quantum
                 computation, improving the design of Nachtigal et al.
                 [2011]. Our work focuses on improving the reversible
                 designs of the alignment unit and the normalization
                 unit, which are the most expensive parts. By changing a
                 few elements of the existing algorithm, including the
                 circuit designs of the RLZC (reversible leading zero
                 counter) and converter, we have reduced the cost by
                 about 68\%. We also propose quantum designs adapted to
                 use gates from fault-tolerant libraries. The KQ for our
                 fault-tolerant design is almost 60 times as expensive
                 as for a 32-bit fixed-point addition. We note that the
                 floating-point representation makes in-place, truly
                 reversible arithmetic impossible, requiring us to
                 retain both inputs, which limits the sustainability of
                 its use for quantum computation.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Shafaei:2014:CSR,
  author =       "Alireza Shafaei and Mehdi Saeedi and Massoud Pedram",
  title =        "Cofactor Sharing for Reversible Logic Synthesis",
  journal =      j-JETC,
  volume =       "11",
  number =       "2",
  pages =        "14:1--14:??",
  month =        nov,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629524",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Nov 5 18:01:28 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Improving circuit realization of known quantum
                 algorithms by CAD techniques has benefits for quantum
                 experimentalists. In this article, the problem of
                 synthesizing a given function on a set of ancillae is
                 addressed. The proposed approach benefits from
                 extensive sharing of cofactors among cubes that appear
                 on function outputs. Accordingly, it can be considered
                 a multilevel logic optimization technique for
                 reversible circuits. In particular, the suggested
                 approach can efficiently implement any $n$-input,
                 $m$-output lookup table (LUT) by a reversible circuit.
                 This problem has interesting applications in the Shor's
                 number-factoring algorithm and in quantum walk on
                 sparse graphs. Simulation results reveal that the
                 proposed cofactor-sharing synthesis algorithm has a
                 significant impact on reducing the size of modular
                 exponentiation circuits for Shor's quantum factoring
                 algorithm, oracle circuits in quantum walk on sparse
                 graphs, and the well-known MCNC benchmarks.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Datta:2014:IRC,
  author =       "Kamalika Datta and Gaurav Rathi and Indranil Sengupta
                 and Hafizur Rahaman",
  title =        "An Improved Reversible Circuit Synthesis Approach
                 using Clustering of {ESOP} Cubes",
  journal =      j-JETC,
  volume =       "11",
  number =       "2",
  pages =        "15:1--15:??",
  month =        nov,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629543",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Nov 5 18:01:28 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "The problem of reversible logic synthesis has drawn
                 the attention of many researchers over the last two
                 decades with growing emphasis on low-power design.
                 Among the various synthesis approaches that have been
                 reported, the ones based on compact circuit
                 representations like Binary Decision Diagrams (BDD) and
                 Exclusive-or Sum-Of-Products (ESOP) are interesting in
                 the sense that they can handle large circuits with more
                 than 100 inputs. The drawback of these approaches,
                 however, is that the generated netlists are
                 sub-optimal, and there is lot of scope for optimizing
                 them. One of the best methods in this regard is an
                 approach, where the ESOP cubes are grouped into
                 sublists based on sharing among more than one outputs.
                 In the work reported in this article, in contrast, an
                 approach based on clustering the ESOP cubes based on
                 their similarity with respect to input variables is
                 presented, along with a technique to map each of the
                 clusters into reversible gate netlists. This approach
                 results in a significant reduction in quantum cost of
                 the final netlist, but requires one additional garbage
                 line. Experimental results on a number of reversible
                 circuit benchmarks have been presented in support of
                 the claim and also demonstrate that the method is very
                 fast.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Tida:2014:NTS,
  author =       "Umamaheswara Rao Tida and Cheng Zhuo and Yiyu Shi",
  title =        "Novel Through-Silicon-Via Inductor-Based On-Chip
                 {DC--DC} Converter Designs in {$3$D} {ICs}",
  journal =      j-JETC,
  volume =       "11",
  number =       "2",
  pages =        "16:1--16:??",
  month =        nov,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2637481",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Nov 5 18:01:28 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "There has been a tremendous research effort in recent
                 years to move DC-DC converters on chip for enhanced
                 performance. However, a major limiting factor to
                 implementing on-chip inductive DC-DC converters is the
                 large area overhead induced by spiral inductors. Thus,
                 we propose using through-silicon-vias (TSVs), a
                 critical enabling technique in three-dimensional (3D)
                 integrated systems, to implement on-chip inductors for
                 DC-DC converters. While existing literature show that
                 TSV inductors are inferior compared with conventional
                 spiral inductors due to substrate loss for RF
                 applications, in this article, we demonstrate that it
                 is not the case for DC-DC converters, which operate at
                 relatively low frequencies. Experimental results show
                 that by replacing conventional spiral inductors with
                 TSV inductors, with almost the same efficiency and
                 output voltage, up to $ 4.3 \times $ and $ 3.2 \times $
                 inductor area reduction can be achieved for the
                 single-phase buck converter and the interleaved buck
                 converter with magnetic coupling, respectively.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Murray:2014:PEC,
  author =       "Jacob Murray and Ryan Kim and Paul Wettin and Partha
                 Pratim Pande and Behrooz Shirazi",
  title =        "Performance Evaluation of Congestion-Aware Routing
                 with {DVFS} on a Millimeter-Wave Small-World Wireless
                 {NoC}",
  journal =      j-JETC,
  volume =       "11",
  number =       "2",
  pages =        "17:1--17:??",
  month =        nov,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2644816",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Nov 5 18:01:28 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "The mm-wave small-world wireless NoC (mSWNoC) has
                 emerged as an enabling interconnection infrastructure
                 for designing high-bandwidth and energy-efficient
                 multicore chips. In this mSWNoC architecture,
                 long-range communication predominately takes place
                 through the wireless shortcuts operating in the range
                 of 10--100GHz, whereas short-range data exchange occurs
                 through conventional metal wires. This results in
                 performance advantages (lower latency and energy
                 dissipation), mainly stemming from using the wireless
                 links as long-range shortcuts between far-apart cores.
                 The performance gain introduced by the wireless
                 channels can be enhanced further if the wireline links
                 of the mSWNoC are optimized according to the traffic
                 patterns arising out of the application workloads.
                 While there is significant energy savings, and hence
                 temperature reduction, in the network due to the mSWNoC
                 architecture, a load-imbalanced network is still
                 susceptible to local temperature hotspots. In this
                 work, we demonstrate that by incorporating
                 congestion-avoidance routing with network-level dynamic
                 voltage and frequency scaling (DVFS) in an mSWNoC, the
                 power and thermal profiles can be improved without a
                 significant impact on the overall network performance.
                 In this work, we demonstrate how novel interconnect
                 architectures enabled by the on-chip wireless links
                 coupled with power management strategies can improve
                 the energy and thermal characteristics of an mSWNoC
                 significantly without introducing any performance
                 degradation with respect to the conventional mesh-based
                 NoC.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Mohanty:2014:SOS,
  author =       "Pragyan (Sheela) Mohanty and Spyros Tragoudas",
  title =        "Scalable Offline Searches in {DNA} Sequences",
  journal =      j-JETC,
  volume =       "11",
  number =       "2",
  pages =        "18:1--18:??",
  month =        nov,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2660774",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Nov 5 18:01:28 MST 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib;
                 https://www.math.utah.edu/pub/tex/bib/string-matching.bib",
  abstract =     "Searching for a particular pattern in a very large DNA
                 database is a fundamental and essential component in
                 computational biology. In the biological world, pattern
                 matching is required for finding repeats in a
                 particular DNA sequence, finding motif, aligning
                 sequences, and other similar tasks. Due to an immense
                 amount and continuous increase of biological data, the
                 searching process requires very fast algorithms. A
                 function-based tool set for fast offline pattern
                 searches in large DNA sequences is proposed. The method
                 benefits from the use of Boolean functions, their
                 compact storage using canonical data structure, and the
                 existence of built-in operators for these data
                 structures. Experiments on DNA sequences from the NCBI
                 database show that the proposed approach is scalable.
                 The time complexity depends on the size of the data
                 structure used for storing the function that represents
                 the DNA sequence. It is shown that the presented
                 approach exhibits sublinear time complexity to the DNA
                 sequence size.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Chaudhuri:2014:ALD,
  author =       "Sourindra M. Chaudhuri and Prateek Mishra and Niraj K.
                 Jha",
  title =        "Accurate Leakage\slash Delay Estimation for {FinFET}
                 Standard Cells under {PVT} Variations using the
                 Response Surface Methodology",
  journal =      j-JETC,
  volume =       "11",
  number =       "2",
  pages =        "19:1--19:??",
  month =        nov,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2665066",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Nov 5 18:01:28 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Among different multi-gate transistors, FinFETs and
                 Trigate FETs have set themselves apart as the most
                 promising candidates for the upcoming 22nm technology
                 node and beyond owing to their superior device
                 performance, lower leakage power consumption, and
                 cost-effective fabrication process. Innovative circuit
                 design and optimization techniques will be required to
                 harness the power of multi-gate transistors, which in
                 turn will depend on accurate leakage and timing
                 characterization of these devices under spatial and
                 environmental variations. Hence, in order to aid
                 circuit designers, we present accurate analytical
                 models using central composite rotatable design (CCRD)
                 based on response surface methodology (RSM) to estimate
                 the leakage current and delay of FinFET standard cells
                 under the effect of variations in gate length ($ L_G$),
                 fin thickness ($ T_{SI}$), gate-oxide thickness ($
                 T_{OX}$), gate-workfunction ($ \Phi_G$), supply voltage
                 ($ V_{DD}$), and temperature ($T$). To the best of our
                 knowledge, this is the first such attempt to develop
                 analytical models for leakage/delay estimation of
                 FinFET logic gates. To derive these models, we employ
                 TCAD device simulations of adjusted 2D device cross
                 sections that have been shown to track TCAD device
                 simulations of 3D device behavior within a 1--3\% error
                 range. This drastically reduces the CPU time of our
                 modeling technique (by several orders of magnitude)
                 without much loss in accuracy. We present analytical
                 leakage and delay models for different sizes and logic
                 styles (e.g., shorted-gate (SG) and independent-gate
                 (IG) FinFETs at the 22nm technology node). Both leakage
                 and delay estimates derived from the analytical models
                 are in close agreement with quasi-Monte Carlo (QMC)
                 simulation results (QMC simulations track the accuracy
                 of Monte Carlo simulations, but are several orders of
                 magnitude faster) obtained for different adjusted-2D
                 logic gates with a root mean square error (RMSE) in the
                 0.23\%--5.87\% range.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Myers:2014:ISI,
  author =       "Chris J. Myers and Herbert Sauro and Anil Wipat",
  title =        "Introduction to the {Special Issue on Computational
                 Synthetic Biology}",
  journal =      j-JETC,
  volume =       "11",
  number =       "3",
  pages =        "20:1--20:??",
  month =        dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2668126",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Jan 7 15:40:14 MST 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "The goal of this special issue is to introduce the
                 field of computational synthetic biology to engineers
                 and computer scientists. The first article gives an
                 introduction to the key biological principles and
                 experimental techniques that support synthetic biology,
                 and it draws analogies with the computing field. This
                 issue also includes five original research articles in
                 computational synthetic biology. The first research
                 article discusses how standards can be used to
                 modularize the design process for genetic circuits. The
                 next two articles introduce new abstraction techniques
                 to improve the efficiency of analysis of genetic
                 circuit models. The last two articles introduce new
                 design techniques that help decouple design from
                 construction. We hope this sampling from the field will
                 help to motivate others to join this exciting and rich
                 area of research.",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@article{Deans:2014:PNS,
  author          = {Tara L. Deans},
  title           = {Parallel Networks: Synthetic Biology and Artificial
                     Intelligence},
  journal         = j-JETC,
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  volume          = {11},
  number          = {3},
  pages           = {21:1--21:??},
  articleno       = {21},
  month           = dec,
  year            = {2014},
  doi             = {https://doi.org/10.1145/2667229},
  issn            = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l          = {1550-4832},
  coden           = {????},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  bibdate         = {Wed Jan 7 15:40:14 MST 2015},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Synthetic biology has emerged as an important
                     technology for engineering cells to behave in
                     controllable and predictable ways. The promise of this
                     modern technology is dependent on our understanding of
                     cellular complexity to allow us to engineer cells with
                     novel function. In this regard, the fields of computer
                     science and synthetic biology are critical for
                     accelerating both our understanding of biological
                     systems, and our ability to quantitatively engineer
                     cells. Thus, advances in biology and biotechnology are
                     arising at the intersection of computer science and
                     synthetic biology approaches. This review seeks to
                     introduce the field of synthetic biology to the
                     computer science community, and to ignite a curiosity
                     and interest in fostering a unique synergy for possible
                     collaborations between synthetic biologists and
                     computer scientists.},
}

@article{Misirli:2014:CMM,
  author          = {Goksel Misirli and Jennifer Hallinan and Anil Wipat},
  title           = {Composable Modular Models for Synthetic Biology},
  journal         = j-JETC,
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  volume          = {11},
  number          = {3},
  pages           = {22:1--22:??},
  articleno       = {22},
  month           = dec,
  year            = {2014},
  doi             = {https://doi.org/10.1145/2631921},
  issn            = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l          = {1550-4832},
  coden           = {????},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  bibdate         = {Wed Jan 7 15:40:14 MST 2015},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Modelling and computational simulation are crucial for
                     the large-scale engineering of biological circuits
                     since they allow the system under design to be
                     simulated prior to implementation in vivo. To support
                     automated, model-driven design it is desirable that in
                     silico models are modular, composable and use standard
                     formats. The synthetic biology design process typically
                     involves the composition of genetic circuits from
                     individual parts. At the most basic level, these parts
                     are representations of genetic features such as
                     promoters, ribosome binding sites (RBSs), and coding
                     sequences (CDSs). However, it is also desirable to
                     model the biological molecules and behaviour that arise
                     when these parts are combined in vivo. Modular models
                     of parts can be composed and their associated systems
                     simulated, facilitating the process of model-centred
                     design. The availability of databases of modular models
                     is essential to support software tools used in the
                     model-driven design process. In this article, we
                     present an approach to support the development of
                     composable, modular models for synthetic biology,
                     termed Standard Virtual Parts. We then describe a
                     programmatically accessible and publicly available
                     database of these models to allow their use by
                     computational design tools.},
}

%%% Abstract typo corrected (``can by applied'' -> ``can be applied'');
%%% verify against the publisher's text if exact quotation matters.
@Article{Madsen:2014:SMC,
  author =       "Curtis Madsen and Zhen Zhang and Nicholas Roehner and
                 Chris Winstead and Chris Myers",
  title =        "Stochastic Model Checking of Genetic Circuits",
  journal =      j-JETC,
  volume =       "11",
  number =       "3",
  pages =        "23:1--23:??",
  month =        dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2644817",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Jan 7 15:40:14 MST 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Synthetic genetic circuits have a number of exciting
                 potential applications such as cleaning up toxic waste,
                 hunting and killing tumor cells, and producing drugs
                 and bio-fuels more efficiently. When designing and
                 analyzing genetic circuits, researchers are often
                 interested in the probability of observing certain
                 behaviors. Discerning these probabilities typically
                 involves simulating the circuit to produce some time
                 series data and computing statistics over the resulting
                 data. However, for very rare behaviors of complex
                 genetic circuits, it becomes computationally
                 intractable to obtain good results as the number of
                 required simulation runs grows exponentially. It is,
                 therefore, necessary to apply numerical methods to
                 determine these probabilities directly. This article
                 describes how stochastic model checking, a method for
                 determining the likelihood that certain events occur in
                 a system, can be applied to models of genetic circuits
                 by translating them into continuous-time Markov chains
                 (CTMCs) and analyzing them using Markov chain analysis
                 to check continuous stochastic logic (CSL) properties.
                 The utility of this approach is demonstrated with
                 several case studies illustrating how this method can
                 be used to perform design space exploration of two
                 genetic oscillators and two genetic state-holding
                 elements. Our results show that this method results in
                 a substantial speedup as compared with conventional
                 simulation-based approaches.",
  acknowledgement = ack-nhfb,
  articleno =    "23",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@article{Fellermann:2014:FMD,
  author          = {Harold Fellermann and Maik Hadorn and Rudolf M.
                     F{\"u}chslin and Natalio Krasnogor},
  title           = {Formalizing Modularization and Data Hiding in
                     Synthetic Biology},
  journal         = j-JETC,
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  volume          = {11},
  number          = {3},
  pages           = {24:1--24:??},
  articleno       = {24},
  month           = dec,
  year            = {2014},
  doi             = {https://doi.org/10.1145/2667231},
  issn            = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l          = {1550-4832},
  coden           = {????},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  bibdate         = {Wed Jan 7 15:40:14 MST 2015},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Biological systems employ compartmentalization and
                     other co-localization strategies in order to
                     orchestrate a multitude of biochemical processes by
                     simultaneously enabling ``data hiding'' and
                     modularization. This article presents recent research
                     that embraces compartmentalization and co-location as
                     an organizational programmatic principle in synthetic
                     biological and biomimetic systems. In these systems,
                     artificial vesicles and synthetic minimal cells are
                     envisioned as nanoscale reactors for programmable
                     biochemical synthesis and as chassis for molecular
                     information processing. We present P systems, brane
                     calculi, and the recently developed chemtainer calculus
                     as formal frameworks providing data hiding and
                     modularization and thus enabling the representation of
                     highly complicated hierarchically organized
                     compartmentalized reaction systems. We demonstrate how
                     compartmentalization can greatly reduce the complexity
                     required to implement computational functionality, and
                     how addressable compartments permit the scaling-up of
                     programmable chemical synthesis.},
}

%%% Abstract: ``paradigms-rule-based'' read as a single compound; the em
%%% dash (apparently lost in extraction) is restored as TeX ``---''.
@Article{Oberortner:2014:RBD,
  author =       "Ernst Oberortner and Swapnil Bhatia and Erik Lindgren
                 and Douglas Densmore",
  title =        "A Rule-Based Design Specification Language for
                 Synthetic Biology",
  journal =      j-JETC,
  volume =       "11",
  number =       "3",
  pages =        "25:1--25:??",
  month =        dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2641571",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Wed Jan 7 15:40:14 MST 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Synthetic Biology is an engineering discipline where
                 parts of DNA sequences are composed into novel, complex
                 systems that execute a desired biological function.
                 Functioning and well-behaving biological systems adhere
                 to a certain set of biological ``rules''. Data exchange
                 standards and Bio-Design Automation (BDA) tools support
                 the organization of part libraries and the exploration
                 of rule-compliant compositions. In this work, we
                 formally define a design specification language,
                 enabling the integration of biological rules into the
                 Synthetic Biology engineering process. The supported
                 rules are divided into five categories: Counting,
                 Pairing, Positioning, Orientation, and Interactions. We
                 formally define the semantics of each rule,
                 characterize the language's expressive power, and
                 perform a case study in that we iteratively design a
                 genetic Priority Encoder circuit following two
                 alternative paradigms---rule-based and template-driven.
                 Ultimately, we touch a method to approximate the
                 complexity and time to computationally enumerate all
                 rule-compliant designs. Our specification language may
                 or may not be expressive enough to capture all designs
                 that a Synthetic Biologist might want to describe, or
                 the complexity one might find through experiments.
                 However, computational support for the acquisition,
                 specification, management, and application of
                 biological rules is inevitable to understand the
                 functioning of biology.",
  acknowledgement = ack-nhfb,
  articleno =    "25",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@article{Huang:2014:FMD,
  author          = {Haiyao Huang and Douglas Densmore},
  title           = {{Fluigi}: Microfluidic Device Synthesis for Synthetic
                     Biology},
  journal         = j-JETC,
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  volume          = {11},
  number          = {3},
  pages           = {26:1--26:??},
  articleno       = {26},
  month           = dec,
  year            = {2014},
  doi             = {https://doi.org/10.1145/2660773},
  issn            = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l          = {1550-4832},
  coden           = {????},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  bibdate         = {Wed Jan 7 15:40:14 MST 2015},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  abstract        = {One goal of synthetic biology is to design and build
                     genetic circuits in living cells for a range of
                     applications. Our incomplete knowledge of the effects
                     of metabolic load and biological ``crosstalk'' on the
                     host cell make it difficult to construct multilevel
                     genetic logic circuits in a single cell, limiting the
                     scalability of engineered biological systems.
                     Microfluidic technologies provide reliable and scalable
                     construction of synthetic biological systems by
                     allowing compartmentalization of cells encoding simple
                     genetic circuits and the spatiotemporal control of
                     communication among these cells. This control is
                     achieved via valves on the microfluidics chip which
                     restrict fluid flow when activated. We describe a
                     Computer Aided Design (CAD) framework called ``Fluigi''
                     for optimizing the layout of genetic circuits on a
                     microfluidic chip, generating the control sequence of
                     the associated signaling fluid valves, and simulating
                     the behavior of the configured biological circuits. We
                     demonstrate the capabilities of Fluigi on a set of
                     Boolean algebraic benchmark circuits found in both
                     synthetic biology and electrical engineering and a set
                     of assay-based benchmark circuits. The integration of
                     microfluidics and synthetic biology has the capability
                     to increase the scale of engineered biological systems
                     for applications in DNA assembly, biosensors, and
                     screening assays for novel orthogonal genetic parts.},
}

@article{Hadjam:2014:RED,
  author          = {Fatima Zohra Hadjam and Claudio Moraga},
  title           = {{RIMEP2}: Evolutionary Design of Reversible Digital
                     Circuits},
  journal         = j-JETC,
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  volume          = {11},
  number          = {3},
  pages           = {27:1--27:??},
  articleno       = {27},
  month           = dec,
  year            = {2014},
  doi             = {https://doi.org/10.1145/2629534},
  issn            = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l          = {1550-4832},
  coden           = {????},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  bibdate         = {Wed Jan 7 15:40:14 MST 2015},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  abstract        = {RIMEP (Reversible Improved Multi Expression
                     Programming), is a system that has been developed for
                     designing reversible digital circuits. This article
                     discloses a new version of RIMEP called ``RIMEP2''. The
                     goal was to evolve reversible circuits in a ``fanout
                     free'' search space. The major changes that RIMEP has
                     undergone, are made in the structure of the chromosome
                     and in the fitness calculation. Although the changes
                     seem to be minor, the impact is effective. The
                     execution time has been considerably decreased and
                     optimal competitive solutions were found for a set of
                     30 selected benchmarks, where a quantum cost reduction
                     up to 96.13\% was reached with an average of 42.17\%.},
}

@article{Houshmand:2014:DDH,
  author          = {Mahboobeh Houshmand and Morteza Saheb Zamani and Mehdi
                     Sedighi and Mona Arabzadeh},
  title           = {Decomposition of Diagonal {Hermitian} Quantum Gates
                     Using Multiple-Controlled {Pauli} {Z} Gates},
  journal         = j-JETC,
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  volume          = {11},
  number          = {3},
  pages           = {28:1--28:??},
  articleno       = {28},
  month           = dec,
  year            = {2014},
  doi             = {https://doi.org/10.1145/2629526},
  issn            = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l          = {1550-4832},
  coden           = {????},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  bibdate         = {Wed Jan 7 15:40:14 MST 2015},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Quantum logic decomposition refers to decomposing a
                     given quantum gate to a set of physically implementable
                     gates. An approach has been presented to decompose
                     arbitrary diagonal quantum gates to a set of
                     multiplexed-rotation gates around z axis. In this
                     article, a special class of diagonal quantum gates,
                     namely diagonal Hermitian quantum gates, is considered
                     and a new perspective to the decomposition problem with
                     respect to decomposing these gates is presented. It is
                     first shown that these gates can be decomposed to a set
                     that solely consists of multiple-controlled Z gates.
                     Then a binary representation for the diagonal Hermitian
                     gates is introduced. It is shown that the binary
                     representations of multiple-controlled Z gates form a
                     basis for the vector space that is produced by the
                     binary representations of all diagonal Hermitian
                     quantum gates. Moreover, the problem of decomposing a
                     given diagonal Hermitian gate is mapped to the problem
                     of writing its binary representation in the specific
                     basis mentioned previously. Moreover, CZ gate is
                     suggested to be the two-qubit gate in the decomposition
                     library, instead of previously used CNOT gate.
                     Experimental results show that the proposed approach
                     can lead to circuits with lower costs in comparison
                     with the previous ones.},
}

@article{Li:2014:SAB,
  author          = {Zhiqiang Li and Hanwu Chen and Xiaoyu Song and Marek
                     Perkowski},
  title           = {A Synthesis Algorithm for $4$-Bit Reversible Logic
                     Circuits with Minimum Quantum Cost},
  journal         = j-JETC,
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  volume          = {11},
  number          = {3},
  pages           = {29:1--29:??},
  articleno       = {29},
  month           = dec,
  year            = {2014},
  doi             = {https://doi.org/10.1145/2629542},
  issn            = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l          = {1550-4832},
  coden           = {????},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  bibdate         = {Wed Jan 7 15:40:14 MST 2015},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  abstract        = {This article presents an algorithm which can quickly
                     find the exact minimum solution to almost all of 4-bit
                     reversible functions. We assume minimization of quantum
                     cost (MQC). This algorithm is designed in the most
                     memory-efficient way, or it will quickly run out of
                     memory. Therefore, we construct the shortest coding of
                     permutations, the topological compression and flexible
                     data structures for the memory savings. First, hash
                     tables are used for all 8-gate 4-bit circuits with the
                     minimization of gate count (MGC) by using the GT
                     library (with NOT, CNOT, Toffoli and Toffoli-4 gates).
                     Second, we merge and split the hash tables, thus
                     generating a single longer hash table for
                     high-performance. Third, we synthesize these circuits
                     with MQC by using the GTP library (with GT, Peres, and
                     Inverted Peres gates) based on the hash table. Finally,
                     according to the comparison of the QC of circuits, the
                     algorithm can quickly converge for any 4-bit reversible
                     circuit with MQC. By synthesizing all benchmark
                     functions, in comparison with Szyprowski and Kerntopf
                     [2011], the running time and QC are reduced up to
                     99.95\% and 18.2\%, respectively.},
}

@article{Sen:2014:RRC,
  author          = {Bibhash Sen and Manojit Dutta and Samik Some and
                     Biplab K. Sikdar},
  title           = {Realizing Reversible Computing in {QCA} Framework
                     Resulting in Efficient Design of Testable {ALU}},
  journal         = j-JETC,
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  volume          = {11},
  number          = {3},
  pages           = {30:1--30:??},
  articleno       = {30},
  month           = dec,
  year            = {2014},
  doi             = {https://doi.org/10.1145/2629538},
  issn            = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l          = {1550-4832},
  coden           = {????},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  bibdate         = {Wed Jan 7 15:40:14 MST 2015},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Reversible logic is emerging as a prospective logic
                     design style for implementing ultra-low-power VLSI
                     circuits. It promises low-power consuming circuits by
                     nullifying the energy dissipation in irreversible
                     logic. On the other hand, as a potential alternative to
                     CMOS technology, Quantum-dot Cellular Automata (QCA)
                     promises energy efficient digital design with high
                     device density and high computing speed. The
                     integration of reversible logic in QCA circuit is
                     expected to be effective in addressing the issue of
                     energy dissipation at nano scale regime. This work
                     targets the design of reversible ALU (arithmetic logic
                     unit) in QCA framework and proposes a new ``Reversible
                     QCA'' (RQCA). The primary design focus is on optimizing
                     the number of reversible gates, quantum cost and the
                     garbage outputs that are the most important hindrances
                     in realizing reversible logic. Besides optimization,
                     the fault coverage capability of RQCA under
                     missing/additional cell deposition defects is analysed.
                     The scope of reversible logic is further outstretched
                     by introducing a novel DFT (design for testability)
                     architecture around the reversible ALU that reduces
                     testing overhead. The performance of proposed ALU is
                     evaluated, subjected to different faults, and is
                     established to be more effective than the existing
                     ALU.},
}

@article{Rahman:2014:AQT,
  author          = {Md. Mazder Rahman and Gerhard W. Dueck and Joseph D.
                     Horton},
  title           = {An Algorithm for Quantum Template Matching},
  journal         = j-JETC,
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  volume          = {11},
  number          = {3},
  pages           = {31:1--31:??},
  articleno       = {31},
  month           = dec,
  year            = {2014},
  doi             = {https://doi.org/10.1145/2629537},
  issn            = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l          = {1550-4832},
  coden           = {????},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  bibdate         = {Wed Jan 7 15:40:14 MST 2015},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Quantum circuits are often generated by decomposing
                     gates from networks with classical reversible gates.
                     Only in rare cases, the results are minimal.
                     Post-optimization methods, such as template matching,
                     are employed to reduce the quantum costs of circuits.
                     Quantum templates are derived from identity circuits.
                     All minimal realizations, within certain limitations,
                     can be embedded into templates. Due to this property,
                     templates matching has the potential to reduce quantum
                     costs of circuits. However, one of the difficulties in
                     finding templates matches is due to the mobility of the
                     gates within the circuit. Thus far, template matching
                     procedures have employed heuristics to reduce the
                     search space. This article presents an in-depth study
                     of exact template matching with a set of algorithms. A
                     graph structure with the corresponding circuits
                     facilitates the discovery of potential sequences of
                     templates to be matched, and how exact minimization of
                     circuits can be accomplished. The significance of the
                     proposed method is verified in benchmarks
                     optimization.},
}

@article{Hammerstrom:2015:ISI,
  author          = {Dan Hammerstrom and Vijaykrishnan Narayanan},
  title           = {Introduction to Special Issue on Neuromorphic
                     Computing},
  journal         = j-JETC,
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  volume          = {11},
  number          = {4},
  pages           = {32:1--32:??},
  articleno       = {32},
  month           = apr,
  year            = {2015},
  doi             = {https://doi.org/10.1145/2728709},
  issn            = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l          = {1550-4832},
  coden           = {????},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  bibdate         = {Tue Apr 28 05:59:37 MDT 2015},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
}

@article{Rodriguez:2015:TSS,
  author          = {Laurent Rodriguez and Beno{\^\i}t Miramond and
                     Bertrand Granado},
  title           = {Toward a Sparse Self-Organizing Map for Neuromorphic
                     Architectures},
  journal         = j-JETC,
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  volume          = {11},
  number          = {4},
  pages           = {33:1--33:??},
  articleno       = {33},
  month           = apr,
  year            = {2015},
  doi             = {https://doi.org/10.1145/2638559},
  issn            = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l          = {1550-4832},
  coden           = {????},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  bibdate         = {Tue Apr 28 05:59:37 MDT 2015},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Neurobiological systems have often been a source of
                     inspiration for computational science and engineering,
                     but in the past their impact has also been limited by
                     the understanding of biological models. Today, new
                     technologies lead to an equilibrium situation where
                     powerful and complex computers bring new biological
                     knowledge of the brain behavior. At this point, we
                     possess sufficient understanding to both imagine new
                     brain-inspired computing paradigms and to sustain a
                     classical paradigm which reaches its end programming
                     and intellectual limitations. In this context, we
                     propose to reconsider the computation problem first in
                     the specific domain of mobile robotics. Our main
                     proposal consists in considering computation as part of
                     a global adaptive system, composed of sensors,
                     actuators, a source of energy and a controlling unit.
                     During the adaptation process, the proposed
                     brain-inspired computing structure does not only
                     execute the tasks of the application but also reacts to
                     the external stimulation and acts on the emergent
                     behavior of the system. This approach is inspired by
                     cortical plasticity in mammalian brains and suggests
                     developing the computation architecture along the
                     system's experience. This article proposes modeling
                     this plasticity as a problem of estimating a
                     probability density function. This function would
                     correspond to the nature and the richness of the
                     environment perceived through multiple modalities. We
                     define and develop a novel neural model solving the
                     problem in a distributed and sparse manner. And we
                     integrate this neural map into a bio-inspired hardware
                     substrate that brings the plasticity property into
                     parallel many-core architectures. The approach is then
                     called Hardware Plasticity. The results show that the
                     self-organization properties of our model solve the
                     problem of multimodal sensory data clusterization. The
                     properties of the proposed model allow envisaging the
                     deployment of this adaptation layer into hardware
                     architectures embedded into the robot's body in order
                     to build intelligent controllers.},
}

%%% JETC 11(4), April 2015, article 34: supervised on-chip learning
%%% for a neuro-inspired logic block (NLB) of memristive nanodevices.
@Article{Chabi:2015:CUS,
  author =       "Djaafar Chabi and Weisheng Zhao and Damien Querlioz
                 and Jacques-Olivier Klein",
  title =        "On-Chip Universal Supervised Learning Methods for
                 Neuro-Inspired Block of Memristive Nanodevices",
  journal =      j-JETC,
  volume =       "11",
  number =       "4",
  pages =        "34:1--34:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629503",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Apr 28 05:59:37 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Scaling down beyond CMOS transistors requires the
                 combination of new computing paradigms and novel
                 devices. In this context, neuromorphic architecture is
                 developed to achieve robust and ultra-low power
                 computing systems. Memristive nanodevices are often
                 associated with this architecture to implement
                 efficiently synapses for ultra-high density. In this
                 article, we investigate the design of a neuro-inspired
                 logic block (NLB) dedicated to on-chip function
                 learning and propose learning strategy. It is composed
                 of an array of memristive nanodevices as synapses
                 associated to neuronal circuits. Supervised learning
                 methods are proposed for different type of memristive
                 nanodevices and simulations are performed to
                 demonstrate the ability to learn logic functions with
                 memristive nanodevices. Benefiting from a compact
                 implementation of neuron circuits and the optimization
                 of learning process, this architecture requires small
                 number of nanodevices and moderate power consumption.",
  acknowledgement = ack-nhfb,
  articleno =    "34",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% JETC 11(4), April 2015, article 35: computational simplifications
%%% and hardware optimizations of the GBNN associative-memory model.
@Article{Coussy:2015:FBN,
  author =       "Philippe Coussy and Cyrille Chavet and Hugues Nono
                 Wouafo and Laura Conde-Canencia",
  title =        "Fully Binary Neural Network Model and Optimized
                 Hardware Architectures for Associative Memories",
  journal =      j-JETC,
  volume =       "11",
  number =       "4",
  pages =        "35:1--35:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629510",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Apr 28 05:59:37 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Brain processes information through a complex
                 hierarchical associative memory organization that is
                 distributed across a complex neural network. The GBNN
                 associative memory model has recently been proposed as
                 a new class of recurrent clustered neural network that
                 presents higher efficiency than the classical models.
                 In this article, we propose computational
                 simplifications and architectural optimizations of the
                 original GBNN. This work leads to significant
                 complexity and area reduction without affecting neither
                 memorizing nor retrieving performance. The obtained
                 results open new perspectives in the design of
                 neuromorphic hardware to support large-scale
                 general-purpose neural algorithms.",
  acknowledgement = ack-nhfb,
  articleno =    "35",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% JETC 11(4), April 2015, article 36: neural-circuit-level spiking
%%% network models compatible with neuromorphic hardware and tools.
@Article{Krichmar:2015:LSS,
  author =       "Jeffrey L. Krichmar and Philippe Coussy and Nikil
                 Dutt",
  title =        "Large-Scale Spiking Neural Networks using Neuromorphic
                 Hardware Compatible Models",
  journal =      j-JETC,
  volume =       "11",
  number =       "4",
  pages =        "36:1--36:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629509",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Apr 28 05:59:37 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Neuromorphic engineering is a fast growing field with
                 great potential in both understanding the function of
                 the brain, and constructing practical artifacts that
                 build upon this understanding. For these novel chips
                 and hardware to be useful, hardware compatible
                 applications and simulation tools are needed. We argue
                 that the neural circuit approach, in which networks of
                 neuronal elements model brain circuitry are
                 constructed, allows the development of practical
                 applications and the exploration of brain function. At
                 this level of abstraction, networks of 10$^5$ neurons
                 or larger can be efficiently simulated, but still
                 preserve the neuronal and synaptic dynamics that appear
                 to be important for brain function. Because the neural
                 circuit level supports spiking neural networks and the
                 prevalent Addressable Event Representation (AER)
                 communication scheme, it fits well with many existing
                 neuromorphic hardware and simulation tools. To show how
                 this approach can be applied, we present case studies
                 of spiking neural networks in vision and recognition
                 tasks based on one instantiation of a simulation
                 environment. However, there are now many hardware
                 options, simulation environments, and applications in
                 this emerging field. These approaches and other
                 considerations are discussed.",
  acknowledgement = ack-nhfb,
  articleno =    "36",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% JETC 11(4), April 2015, article 37: the RSDNF (randomly spiking
%%% dynamic neural fields) model and its FPGA implementation.
%%% The first author's compound surname is braced so that BibTeX keeps
%%% "Chappet De Vangel" together as the last name.
@Article{ChappetDeVangel:2015:RSD,
  author =       "Beno{\^\i}t {Chappet De Vangel} and Cesar
                 Torres-Huitzil and Bernard Girau",
  title =        "Randomly Spiking Dynamic Neural Fields",
  journal =      j-JETC,
  volume =       "11",
  number =       "4",
  pages =        "37:1--37:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629517",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Apr 28 05:59:37 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Bio-inspired neural computation attracts a lot of
                 attention as a possible solution for the future
                 challenges in designing computational resources.
                 Dynamic neural fields (DNF) provide cortically inspired
                 models of neural populations to which computation can
                 be applied for a wide variety of tasks, such as
                 perception and sensorimotor control. DNFs are often
                 derived from continuous neural field theory (CNFT). In
                 spite of the parallel structure and regularity of CNFT
                 models, few studies of hardware implementations have
                 been carried out targeting embedded real-time
                 processing. In this article, a hardware-friendly model
                 adapted from the CNFT is introduced, namely the RSDNF
                 model (randomly spiking dynamic neural fields). Thanks
                 to their simplified 2D structure, RSDNFs achieve
                 scalable parallel implementations on digital hardware
                 while maintaining the behavioral properties of CNFT
                 models. Spike-based computations within neurons in the
                 field are introduced to reduce interneuron connection
                 bandwidth. Additionally, local stochastic spike
                 propagation ensures inhibition and excitation broadcast
                 without a fully connected network. The behavioral
                 soundness and robustness of the model in the presence
                 of noise and distracters is fully validated through
                 software and hardware. A field programmable gate array
                 (FPGA) implementation shows how the RSDNF model ensures
                 a level of density and scalability out of reach for
                 previous hardware implementations of dynamic neural
                 field models.",
  acknowledgement = ack-nhfb,
  articleno =    "37",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% JETC 11(4), April 2015, article 38: reconfigurable digital
%%% neuromorphic processor (DNP) with a memristor crossbar array.
@Article{Kim:2015:RDN,
  author =       "Yongtae Kim and Yong Zhang and Peng Li",
  title =        "A Reconfigurable Digital Neuromorphic Processor with
                 Memristive Synaptic Crossbar for Cognitive Computing",
  journal =      j-JETC,
  volume =       "11",
  number =       "4",
  pages =        "38:1--38:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700234",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Apr 28 05:59:37 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "This article presents a brain-inspired reconfigurable
                 digital neuromorphic processor (DNP) architecture for
                 large-scale spiking neural networks. The proposed
                 architecture integrates an arbitrary number of N
                 digital leaky integrate-and-fire (LIF) silicon neurons
                 to mimic their biological counterparts and on-chip
                 learning circuits to realize spike-timing-dependent
                 plasticity (STDP) learning rules. We leverage memristor
                 nanodevices to build an N $ \times $ N crossbar array
                 to store not only multibit synaptic weight values but
                 also network configuration data with significantly
                 reduced area overhead. Additionally, the crossbar array
                 is designed to be accessible both column- and row-wise
                 to expedite the synaptic weight update process for
                 learning. The proposed digital pulse width modulator
                 (PWM) produces binary pulses with various durations for
                 reading and writing the multilevel memristive crossbar.
                 The proposed column based analog-to-digital conversion
                 (ADC) scheme efficiently accumulates the presynaptic
                 weights of each neuron and reduces silicon area
                 overhead by using a shared arithmetic unit to process
                 the LIF operations of all N neurons. With 256 silicon
                 neurons, learning circuits and 64K synapses, the power
                 dissipation and area of our DNP are 6.45 mW and 1.86
                 mm$^2$, respectively, when implemented in a 90-nm CMOS
                 technology. The functionality of the proposed DNP
                 architecture is demonstrated by realizing an
                 unsupervised-learning based character recognition
                 system.",
  acknowledgement = ack-nhfb,
  articleno =    "38",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% JETC 11(4), April 2015, article 39: titled as a special-issue
%%% item on many-core systems; this entry has no abstract field.
@Article{Daneshtalab:2015:SIE,
  author =       "Masoud Daneshtalab and Farhad Mehdipour and Zhiyi Yu
                 and Hannu Tenhunen",
  title =        "Special Issue on Emerging Many-Core Systems for
                 Exascale Computing",
  journal =      j-JETC,
  volume =       "11",
  number =       "4",
  pages =        "39:1--39:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2717312",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Apr 28 05:59:37 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "39",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% JETC 11(4), April 2015, article 40: dynamic parallelism, voltage
%%% and frequency scaling (PVFS) on CGRAs.  The acronyms in the title
%%% are braced so that bibliography styles keep their capitalization.
@Article{Jafri:2015:AID,
  author =       "Syed M. A. H. Jafri and Ozan Ozbag and Nasim Farahini
                 and Kolin Paul and Ahmed Hemani and Juha Plosila and
                 Hannu Tenhunen",
  title =        "Architecture and Implementation of Dynamic
                 Parallelism, Voltage and Frequency Scaling {(PVFS)} on
                 {CGRAs}",
  journal =      j-JETC,
  volume =       "11",
  number =       "4",
  pages =        "40:1--40:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700250",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Apr 28 05:59:37 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "In the era of platforms hosting multiple applications
                 with arbitrary performance requirements, providing a
                 worst-case platform-wide voltage/frequency operating
                 point is neither optimal nor desirable. As a solution
                 to this problem, designs commonly employ dynamic
                 voltage and frequency scaling (DVFS). DVFS promises
                 significant energy and power reductions by providing
                 each application with the operating point (and hence
                 the performance) tailored to its needs. To further
                 enhance the optimization potential, recent works
                 interleave dynamic parallelism with conventional DVFS.
                 The induced parallelism results in performance gains
                 that allow an application to lower its operating point
                 even further (thereby saving energy and power
                 consumption). However, the existing works employ costly
                 dedicated hardware (for synchronization) and rely
                 solely on greedy algorithms to make parallelism
                 decisions. To efficiently integrate parallelism with
                 DVFS, compared to state-of-the-art, we exploit the
                 reconfiguration (to reduce DVFS synchronization
                 overheads) and enhance the intelligence of the greedy
                 algorithm (to make optimal parallelism decisions).
                 Specifically, our solution relies on dynamically
                 reconfigurable isolation cells and an autonomous
                 parallelism, voltage, and frequency selection
                 algorithm. The dynamically reconfigurable isolation
                 cells reduce the area overheads of DVFS circuitry by
                 configuring the existing resources to provide
                 synchronization. The autonomous parallelism, voltage,
                 and frequency selection algorithm ensures high power
                 efficiency by combining parallelism with DVFS. It
                 selects that parallelism, voltage, and frequency trio
                 which consumes minimum power to meet the deadlines on
                 available resources. Synthesis and simulation results
                 using various applications/algorithms (WLAN, MPEG4,
                 FFT, FIR, matrix multiplication) show that our solution
                 promises significant reduction in area and power
                 consumption (23\% and 51\%) compared to
                 state-of-the-art.",
  acknowledgement = ack-nhfb,
  articleno =    "40",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% JETC 11(4), April 2015, article 41: replacement policies tuned
%%% for sub-block caches.
%%% NOTE(review): "cache trashing" in the abstract is possibly a typo
%%% for "cache thrashing" --- confirm against the publisher's abstract.
@Article{Olorode:2015:IPS,
  author =       "Oluleye Olorode and Mehrdad Nourani",
  title =        "Improving Performance in Sub-Block Caches with
                 Optimized Replacement Policies",
  journal =      j-JETC,
  volume =       "11",
  number =       "4",
  pages =        "41:1--41:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2668127",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Apr 28 05:59:37 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Recent advances in computer processor design have led
                 to the introduction of sub-blocking to cache
                 architectures. Sub-block caches reduce the tag area and
                 power overhead in caches without reducing the effective
                 cache size by using fewer tags to index the full data
                 RAM array. In spite of achieving reduced area and power
                 overhead, sub-block caches suffer performance
                 degradation due to cache trashing. This occurs when a
                 wider cache line (super-block), made up of multiple
                 valid cache lines (sub-blocks), is replaced or evicted
                 when only a sub-block is to be allocated into the wider
                 super-block. To address this problem, we propose cache
                 replacement policies as they relate specifically to
                 sub-block caches. We propose new replacement policies
                 that are tuned for sub-block caches by adding more
                 intelligence based on the valid state of individual
                 sub-blocks of a super-block. We also investigate the
                 effect of using a few level-0 registers to bypass a few
                 level-1 cache pipe stages on sub-block cache
                 performance. To evaluate the performance improvement
                 offered by our proposed replacement policies and the
                 use of level-0 registers, we developed a sub-block
                 cache simulator based on the Simplescalar toolset for
                 single-core evaluations and the Sniper Simulator for
                 multicore evaluations. We show that, with minimal
                 architectural updates to existing conventional cache
                 replacement policies, we are able to improve level-1
                 cache hit rates by up to 4.17\% using our proposed
                 policies alone on SPEC2006 benchmarks and up to 14\% in
                 shared level-2 caches using multicore benchmark suites:
                 PARSEC and SPLASH2.",
  acknowledgement = ack-nhfb,
  articleno =    "41",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% JETC 11(4), April 2015, article 42: iConn, a 2D mesh-style on-chip
%%% interconnect for single-die CPU-GPU architectures.  The name is
%%% braced in the title so that styles keep its mixed-case spelling.
@Article{Li:2015:ICI,
  author =       "Zhongqi Li and Nilanjan Goswami and Tao Li",
  title =        "{iConn}: a Communication Infrastructure for
                 Heterogeneous Computing Architectures",
  journal =      j-JETC,
  volume =       "11",
  number =       "4",
  pages =        "42:1--42:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700238",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Apr 28 05:59:37 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Recently, the graphics processing unit (GPU) has made
                 significant progress as a general-purpose parallel
                 processor. The CPU and GPU cooperate together to solve
                 data-parallel and control-intensive real-world
                 applications in an optimized fashion. For example,
                 emerging heterogeneous computing architectures such as
                 Intel Sandy Bridge and AMD Fusion integrate the
                 functionality of the CPU and GPU in a single die.
                 However, the single-die CPU-GPU heterogeneous computing
                 architecture faces the challenge of tight budget of die
                 area. The conventional homogeneous interconnect fails
                 to provide satisfactory performance by fully exploiting
                 the given area budget in the heterogeneous processing
                 era. In this article, we aim to implement an
                 interconnect network within an area budget for a
                 CPU-GPU heterogeneous computing architecture. We
                 propose iConn, a 2D mesh-style on-chip heterogeneous
                 communication infrastructure. In iConn, a set of GPU
                 logical units such as the stream processors, the
                 texture units, and the rendering output units form a
                 computing unit (CU). Differing from conventional
                 homogeneous router design, iConn adopts nonuniform
                 on-chip routers in order to meet the unique
                 communication demands from each single CPU and CU. The
                 routers can also dynamically allocate their buffers
                 across all virtual channels (VCs) to meet the latency
                 requirements of CPUs and CUs. Moreover, the memory
                 controller scheduling algorithm is modified from
                 traditional load-over-store scheduling in order to
                 prioritize the traffic. Our simulation results show
                 that iConn improves the performance of CPUs by 23.0\%
                 and CUs by 9.4\%.",
  acknowledgement = ack-nhfb,
  articleno =    "42",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% JETC 11(4), April 2015, article 43: closed-form reliability
%%% criterion for TSV-based 3D NoCs.  Network dimensions in the
%%% abstract are written as single math expressions so that each
%%% \times has both operands inside math mode.
@Article{Khayambashi:2015:ARA,
  author =       "Misagh Khayambashi and Pooria M. Yaghini and Ashkan
                 Eghbal and Nader Bagherzadeh",
  title =        "Analytical Reliability Analysis of {$3$D} {NoC} under
                 {TSV} Failure",
  journal =      j-JETC,
  volume =       "11",
  number =       "4",
  pages =        "43:1--43:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700236",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Apr 28 05:59:37 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "The network-on-chip (NoC) technology allows for
                 integration of a manycore design on a single chip for
                 higher efficiency and scalability. Three-dimensional
                 (3D) NoCs offer several advantages over two-dimensional
                 (2D) NoCs. Through-silicon via (TSV) technology is one
                 of the candidates for implementation of 3D NoCs. TSV
                 reliability analysis is still challenging for 3D NoC
                 designers because of their unique electrical, thermal,
                 and physical characteristics. After providing an
                 overview of common TSV issues, this article aims to
                 define a reliability criterion for NoC and provide a
                 framework for quantifying this reliability as it
                 relates to TSV issues. TSV issues are modeled as a
                 time-invariant failure probability. Also, a reliability
                 criterion for TSV-based NoC is defined. The
                 relationship between NoC reliability and TSV failure is
                 quantified. For the first time, the reliability
                 criterion is reduced to a tractable closed-form
                 expression that requires a single Monte Carlo
                 simulation. Importantly, the Monte Carlo simulation
                 depends only on network geometry. To demonstrate our
                 proposed method, the reliability criterion of a simple
                 $ 8 \times 8 \times 8 $ NoC supported by an $ 8 \times
                 8 \times 7 $ network of TSVs is calculated.",
  acknowledgement = ack-nhfb,
  articleno =    "43",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% JETC 12(1), July 2015, article 1: mNoC, nanophotonic
%%% network-on-chip crossbars built from molecular-scale devices.
@Article{Pang:2015:MLN,
  author =       "Jun Pang and Christopher Dwyer and Alvin R. Lebeck",
  title =        "{mNoC}: Large Nanophotonic Network-on-Chip Crossbars
                 with Molecular Scale Devices",
  journal =      j-JETC,
  volume =       "12",
  number =       "1",
  pages =        "1:1--1:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700241",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Aug 4 07:26:23 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Moore's law and the continuity of device scaling have
                 led to an increasing number of cores/nodes on a chip,
                 creating a need for new mechanisms to achieve
                 high-performance and power-efficient Network-on-Chip
                 (NoC). Nanophotonics based NoCs provide for higher
                 bandwidth and more power efficient designs than
                 electronic networks. Present approaches often use an
                 external laser source, ring resonators, and waveguides.
                 However, they still suffer from important limitations:
                 large static power consumption, and limited network
                 scalability. In this article, we explore the use of
                 emerging molecular scale devices to construct
                 nanophotonic networks: Molecular-scale Network-on-Chip
                 (mNoC). We leverage on-chip emitters such as quantum
                 dot LEDs, which provide electrical to optical signal
                 modulation, and chromophores, which provide optical
                 signal filtering for receivers. These devices replace
                 the ring resonators and the external laser source used
                 in contemporary nanophotonic NoCs. They reduce energy
                 consumption or enable scaling to larger crossbars for a
                 reduced energy budget. We present a Single Writer
                 Multiple Reader (SWMR) bus based crossbar mNoC. Our
                 evaluation shows that an mNoC can achieve more than
                 88\% reduction in energy for a $ 64 \times 64 $
                 crossbar compared to similar ring resonator based
                 designs. Additionally, an mNoC can scale to a $ 256
                 \times 256 $ crossbar with an average 10\% performance
                 improvement and 54\% energy reduction.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% JETC 12(1), July 2015, article 2: floating gate transistor design
%%% using multilayer graphene nanoribbon and carbon nanotubes.
@Article{Hossain:2015:MGN,
  author =       "Nahid M. Hossain and Masud H. Chowdhury",
  title =        "Multilayer Graphene Nanoribbon and Carbon Nanotube
                 Based Floating Gate Transistor for Nonvolatile Flash
                 Memory",
  journal =      j-JETC,
  volume =       "12",
  number =       "1",
  pages =        "2:1--2:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2701428",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Aug 4 07:26:23 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Floating gate transistor is the fundamental building
                 block of nonvolatile flash memory, which is one of the
                 most widely used memory gadgets in modern micro and
                 nano electronic applications. Recently there has been a
                 surge of interest to introduce a new generation of
                 memory devices using graphene nanotechnology. In this
                 article, we present a new floating gate transistor
                 (FGT) design based on multilayer graphene nanoribbon
                 (MLGNR) and carbon nanotube (CNT). In the proposed FGT,
                 a MLGNR structure would be used as the channel of the
                 field effect transistor (FET) and a layer of CNTs would
                 be used as the floating gate. We have performed an
                 analysis of the programming and erasing mechanism in
                 the floating gate and its dependence on the applied
                 control gate voltages. Based on our analysis we have
                 observed that proposed graphene based floating gate
                 transistor could be operated at a low voltage compared
                 to conventional silicon based floating gate devices. We
                 have presented detail analysis of the operation and the
                 programming and erasing processes of the proposed FGT;
                 the dependency of the programming and erasing current
                 density on different parameters; and the impact of
                 scaling the thicknesses of the control and tunneling
                 oxides. To perform these analyses we have developed
                 equivalent models for device capacitances.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Ghofrani:2015:LPV,
  author =       {Amirali Ghofrani and
                 Miguel-Angel Lastras-Monta{\~n}o and
                 Siddharth Gaba and
                 Melika Payvand and
                 Wei Lu and
                 Luke Theogarajan and
                 Kwang-Ting Cheng},
  title =        {A Low-Power Variation-Aware Adaptive Write Scheme
                 for Access-Transistor-Free Memristive Memory},
  journal =      j-JETC,
  volume =       {12},
  number =       {1},
  pages =        {3:1--3:??},
  month =        jul,
  year =         {2015},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2717313},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Tue Aug 4 07:26:23 MDT 2015},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract =     {Recent advances in access-transistor-free memristive
                 crossbars have demonstrated the potential of memristor arrays
                 as high-density and ultra-low-power memory. However, with
                 considerable variations in the write-time characteristics of
                 individual memristors, conventional fixed-pulse write schemes
                 cannot guarantee reliable completion of the write operations
                 and waste significant amount of energy. We propose an
                 adaptive write scheme that adaptively adjusts the write
                 pulses to address such variations in memristive arrays,
                 resulting in $ 7 \times $--$ 11 \times $ average energy
                 saving in our case studies. Our scheme embeds an online
                 monitor to detect the completion of a write operation and
                 takes into account the parasitic effect of line-shared
                 devices in access-transistor-free crossbars. This feature
                 also helps shorten the test time of memory march algorithms
                 by eliminating the need of a verifying read right after a
                 write, which is commonly employed in the test sequences of
                 march algorithms.},
  acknowledgement = ack-nhfb,
  articleno =    {3},
  fjournal =     {ACM Journal on Emerging Technologies in Computing Systems
                 (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Venkatesan:2015:EEA,
  author =       "Rangharajan Venkatesan and Mrigank Sharad and Kaushik
                 Roy and Anand Raghunathan",
  title =        "Energy-Efficient All-Spin Cache Hierarchy Using
                 Shift-Based Writes and Multilevel Storage",
  journal =      j-JETC,
  volume =       "12",
  number =       "1",
  pages =        "4:1--4:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2723165",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Aug 4 07:26:23 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Spintronic memories are considered to be promising
                 candidates for future on-chip memories due to their
                 high density, nonvolatility, and near-zero leakage.
                 However, they also face challenges such as high write
                 energy and latency and limited read speed due to
                 single-ended sensing. Further, the conflicting
                 requirements of read and write operations lead to
                 stringent design constraints that severely compromises
                 their benefits. Recently, domain wall memory was
                 proposed as a spintronic memory that has a potential
                 for very high density by storing multiple bits in the
                 domains of a ferromagnetic nanowire. While reliable
                 operation of DWM memory with multiple domains faces
                 many challenges, single-bit cells that utilize domain
                 wall motion for writes have been experimentally
                 demonstrated [Fukami et al. 2009]. This bit-cell, which
                 we refer to as Domain Wall Memory with Shift-based
                 Write (DWM-SW), achieves improved write efficiency and
                 features decoupled read-write paths, enabling
                 independent optimizations of read and write operations.
                 However, these benefits are achieved at the cost of
                 sacrificing the original goal of improved density. In
                 this work, we explore multilevel storage as a new
                 direction to enhance the density benefits of DWM-SW. At
                 the device level, we propose a new device---multilevel
                 DWM with shift-based write (ML-DWM-SW)---that is capable
                 of storing 2 bits in a single device. At the circuit
                 level, we propose a ML-DWM-SW based bit-cell design and
                 layout. The ML-DWM-SW bit-cell incurs no additional
                 area overhead compared to the DWM-SW bit-cell despite
                 storing an additional bit, thereby achieving roughly
                 twice the density. However, it requires a two-step
                 write operation and has data-dependent read and write
                 energies, which pose unique challenges. To address
                 these issues, we propose suitable architectural
                 optimizations: (i) intra-word interleaving and (ii) bit
                 encoding. We design ``all-spin'' cache architectures
                 using the proposed ML-DWM-SW bit-cell for both general
                 purpose processors as well as general purpose graphics
                 processing units (GPGPUs). We perform an iso-capacity
                 replacement of SRAM with spintronic memories and study
                 the energy and area benefits at iso-performance
                 conditions. For general purpose processors, the
                 ML-DWM-SW cache achieves 10X reduction in energy and
                 4.4X reduction in cache area compared to an SRAM cache
                 and 2X and 1.7X reduction in energy and area,
                 respectively, compared to an STT-MRAM cache. For
                 GPGPUs, the ML-DWM-SW cache achieves 5.3X reduction in
                 energy and 3.6X area reduction compared to SRAM and
                 3.5X energy reduction and 1.9X area reduction compared
                 to STT-MRAM.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Park:2015:MME,
  author =       {Kyu Ho Park and
                 Woomin Hwang and
                 Hyunchul Seok and
                 Chulmin Kim and
                 Dong-jae Shin and
                 Dong Jin Kim and
                 Min Kyu Maeng and
                 Seong Min Kim},
  title =        {{MN-MATE}: Elastic Resource Management of Manycores and a
                 Hybrid Memory Hierarchy for a Cloud Node},
  journal =      j-JETC,
  volume =       {12},
  number =       {1},
  pages =        {5:1--5:??},
  month =        jul,
  year =         {2015},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2701429},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Tue Aug 4 07:26:23 MDT 2015},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract =     {Recent advent of manycore system increases needs for larger
                 but faster memory hierarchy. Emerging next generation
                 memories such as on-chip DRAM and nonvolatile memory (NVRAM)
                 are promising candidates for replacement of DRAM-only main
                 memory. Combined with the manycore trends, it gives an
                 opportunity to rethink conventional resource management
                 system with a memory hierarchy for a single cloud node. In
                 an attempt to mitigate the energy and memory problems, we
                 propose MN-MATE, an elastic resource management architecture
                 for a single cloud node with manycores, on-chip DRAM, and
                 large size of off-chip DRAM and NVRAM. In MN-MATE, the
                 hypervisor places consolidated VMs and balances memory among
                 them. Based on the monitored information about the allocated
                 memory, a guest OS co-schedules tasks accessing different
                 types of memory with complementary access intensity.
                 Polymorphic management of DRAM hierarchy accelerates average
                 memory access speed inside each guest OS. A guest OS reduces
                 energy consumption with small performance loss based on the
                 NVRAM-aware data placement policy and the hybrid page cache.
                 A new lightweight kernel is developed to reduce the overhead
                 from the guest OS for scientific applications. Experiment
                 results show that our techniques in MN-MATE platform improve
                 system performance and reduce energy consumption.},
  acknowledgement = ack-nhfb,
  articleno =    {5},
  fjournal =     {ACM Journal on Emerging Technologies in Computing Systems
                 (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Wang:2015:WAS,
  author =       {Jue Wang and
                 Yuan Xie},
  title =        {A Write-Aware {STTRAM}-Based Register File Architecture
                 for {GPGPU}},
  journal =      j-JETC,
  volume =       {12},
  number =       {1},
  pages =        {6:1--6:??},
  month =        jul,
  year =         {2015},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2700230},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Tue Aug 4 07:26:23 MDT 2015},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract =     {The massively parallel processing capacity of GPGPUs
                 requires a large register file (RF), and its size keeps
                 increasing to support more concurrent threads from
                 generation to generation. Using traditional SRAM-based RFs,
                 there are concerns in both area cost and energy consumption,
                 and soon they will become unrealistic. In this work, we
                 analyze the feasibility of using STTRAM-based RF designs,
                 which have benefits in terms of smaller silicon area and
                 zero standby leakage power. However, STTRAM long write
                 latency and high write energy bring new challenges.
                 Therefore, we propose a write-aware STTRAM-based RF
                 architecture (WarRF), which contains two techniques: Split
                 Bank Write modifies the arbitrator design to increase the
                 parallelism of read and write accesses in the same bank;
                 Write Pool reduces the number of repeated write accesses to
                 RFs. Our experiment shows that the performance of
                 STTRAM-based RF is improved by 13\% and up to 23\% after
                 adopting WarRF. In addition, the energy consumption is
                 reduced by 38\% on average compared to SRAM-based RFs.},
  acknowledgement = ack-nhfb,
  articleno =    {6},
  fjournal =     {ACM Journal on Emerging Technologies in Computing Systems
                 (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Romani:2015:SSC,
  author =       "Aldo Romani and Matteo Filippi and Michele Dini and
                 Marco Tartagni",
  title =        "A Sub-{$ \mu $A} Stand-By Current Synchronous Electric
                 Charge Extractor for Piezoelectric Energy Harvesting",
  journal =      j-JETC,
  volume =       "12",
  number =       "1",
  pages =        "7:1--7:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700244",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Aug 4 07:26:23 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "In the field of energy harvesting there is a growing
                 interest in power management circuits with intrinsic
                 sub-$ \mu $A current consumptions, in order to operate
                 efficiently with very low levels of available power. In
                 this context, integrated circuits proved to be a viable
                 solution with high associated nonrecurring costs and
                 design risks. As an alternative, this article presents
                 a fully autonomous and battery-less circuit solution
                 for piezoelectric energy harvesting based on discrete
                 components in a low-cost PCB technology, which achieves
                 a comparable performance in a $ 32 \times 43 $ mm$^2$
                 footprint. The power management circuit implements
                 synchronous electric charge extraction (SECE) with a
                 passive bootstrap circuit from fully discharged states.
                 Circuit characterization showed that the circuit
                 consumes less than 1 $ \mu $A with a 3V output and may
                 achieve energy conversion efficiencies of up to 85\%.
                 In addition, the circuit is specifically designed for
                 operating with input and output voltages up to 20V,
                 which grants a significant flexibility in the choice of
                 transducers and energy storage capacitors.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Jayakumar:2015:QHS,
  author =       "Hrishikesh Jayakumar and Arnab Raha and Woo Suk Lee
                 and Vijay Raghunathan",
  title =        "{QuickRecall}: a {HW\slash SW} Approach for Computing
                 across Power Cycles in Transiently Powered Computers",
  journal =      j-JETC,
  volume =       "12",
  number =       "1",
  pages =        "8:1--8:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700249",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Aug 4 07:26:23 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Transiently Powered Computers (TPCs) are a new class
                 of batteryless embedded systems that depend solely on
                 energy harvested from external sources for performing
                 computations. Enabling long-running computations on
                 TPCs is a major challenge due to the highly
                 intermittent nature of the power supply (often bursts
                 of {$<$} 100ms), resulting in frequent system reboots.
                 Prior work seeks to address this issue by frequently
                 checkpointing system state in flash memory, preserving
                 it across power cycles. However, this involves a
                 substantial overhead due to the high erase/write times
                 of flash memory. This article proposes the use of
                 Ferroelectric RAM (FRAM), an emerging nonvolatile
                 memory technology that combines the benefits of SRAM
                 and flash, to seamlessly enable long-running
                 computations in TPCs. We propose a lightweight, in-situ
                 checkpointing technique for TPCs using FRAM that
                 consumes only 30 nJ while decreasing the time taken for
                 saving and restoring a checkpoint to only 21.06 $ \mu $s,
                 which is over two orders of magnitude lower than the
                 corresponding overhead using flash. We have implemented
                 and evaluated our technique, QuickRecall, using the TI
                 MSP430FR5739 FRAM-enabled microcontroller. Experimental
                 results show that our highly-efficient checkpointing
                 translate to significant speedup ($ 1.25 \times $--$
                 8.4 \times $) in program execution time and reduction
                 ($ \approx 3 \times $) in application-level energy
                 consumption.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Chien:2015:FTO,
  author =       {Chia-Hung Chien and
                 Rodney {Van Meter} and
                 Sy-Yen Kuo},
  title =        {Fault-Tolerant Operations for Universal Blind
                 Quantum Computation},
  journal =      j-JETC,
  volume =       {12},
  number =       {1},
  pages =        {9:1--9:??},
  month =        jul,
  year =         {2015},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2700248},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Tue Aug 4 07:26:23 MDT 2015},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract =     {Blind quantum computation is an appealing use of quantum
                 information technology because it can conceal both the
                 client's data and the algorithm itself from the server.
                 However, problems need to be solved in the practical use of
                 blind quantum computation and fault-tolerance is a major
                 challenge. Broadbent et al. proposed running error
                 correction over blind quantum computation, and Morimae and
                 Fujii proposed using fault-tolerant entangled qubits as the
                 resource for blind quantum computation. Both approaches
                 impose severe demands on the teleportation channel, the
                 former requiring unrealistic data rates and the latter
                 near-perfect fidelity. To extend the application range of
                 blind quantum computation, we suggest that Alice send input
                 qubits encoded with error correction code instead of single
                 input qubits. Two fault-tolerant protocols are presented and
                 we showed the trade-off of the computational overhead using
                 the ten-bit quantum carry-lookahead adder as an example.
                 Though these two fault-tolerant protocols require the client
                 to have more quantum computing ability than using approaches
                 from prior work, they provide better fault-tolerance when
                 the client and the server are connected by realistic quantum
                 repeater networks.},
  acknowledgement = ack-nhfb,
  articleno =    {9},
  fjournal =     {ACM Journal on Emerging Technologies in Computing Systems
                 (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Cheng:2015:SSC,
  author =       "Ching-Hwa Cheng",
  title =        "{SCKVdd}: a Scalable Clock-Controlled Self-Stabilized
                 Voltage Technique for Low Power {CMOS} Digital
                 Circuits",
  journal =      j-JETC,
  volume =       "12",
  number =       "1",
  pages =        "10:1--10:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2790754",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Aug 4 07:26:23 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "It has been proposed that small amounts of energy
                 dissipate when transfer through a rising Vdd. In
                 typical power gate circuits, the PMOS transistors
                 (P$_{SW}$) reduce the leakage of power by shutting off
                 outer Vdd to the idle blocks. We expand this technique
                 by utilizing active P$_{SW}$, which are turned on and
                 off by a clock signal. The proposed SCKVdd technique
                 combines the power source gated mechanism and clock
                 signal to generate stable progressive rising voltage to
                 suppress peak and average currents effectively. The
                 SCKVdd technique is a scalable, clock-controlled,
                 self-stabilized voltage technique. This technique is
                 easily implemented in generic digital circuits to
                 reduce power dissipation. A normal CMOS circuit shows a
                 dynamic power consumption increase proportional to the
                 clock frequency. SCKVdd results in a lower-than-usual
                 frequency dependency, and is suitable for high speed
                 clock circuits. SCKVdd can be integrated with
                 frequency, voltage scaling and an activated P$_{SW}$
                 number to implement an efficient power-performance
                 trade-off mechanism. In experiments that investigated
                 constant Vdd for MPEG VLD chips, power dissipation
                 savings were in the range of 42\% to 54\% with only a
                 small delay penalty.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Todri-Sanial:2015:GES,
  author =       {Aida Todri-Sanial and
                 Sanjukta Bhanja},
  title =        {Guest Editorial: Special Issue on Advances in Design of
                 Ultra-Low Power Circuits and Systems in Emerging
                 Technologies},
  journal =      j-JETC,
  volume =       {12},
  number =       {2},
  pages =        {11:1--11:??},
  month =        aug,
  year =         {2015},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2756554},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Tue Sep 8 18:25:16 MDT 2015},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  articleno =    {11},
  fjournal =     {ACM Journal on Emerging Technologies in Computing Systems
                 (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Gaillardon:2015:SLP,
  author =       {Pierre-Emmanuel Gaillardon and
                 Edith Beigne and
                 Suzanne Lesecq and
                 Giovanni {De Micheli}},
  title =        {A Survey on Low-Power Techniques with Emerging
                 Technologies: From Devices to Systems},
  journal =      j-JETC,
  volume =       {12},
  number =       {2},
  pages =        {12:1--12:??},
  month =        aug,
  year =         {2015},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2714566},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Tue Sep 8 18:25:16 MDT 2015},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract =     {Nowadays, power consumption is one of the main limitations
                 of electronic systems. In this context, novel and emerging
                 devices provide new opportunities to extend the trend toward
                 low-power design. In this survey article, we present a
                 transversal survey on energy-efficient techniques ranging
                 from devices to architectures. The actual trends of device
                 research, with fully depleted planar devices, tri-gate
                 geometries, and gate-all-around structures, allows us to
                 reach an increasingly higher level of performance while
                 reducing the associated power. In addition, beyond the
                 simple device property enhancements, emerging devices also
                 lead to innovations at the circuit and architectural levels.
                 In particular, devices whose properties can be tuned through
                 additional terminals enable a fine and dynamic control of
                 device threshold. They also enable designers to realize
                 logic gates and to implement power-related techniques in a
                 compact way unreachable to standard technologies. These
                 innovations reduce power consumption at the gate level and
                 unlock new means of actuation in architectural solutions
                 like adaptive voltage and frequency scaling.},
  acknowledgement = ack-nhfb,
  articleno =    {12},
  fjournal =     {ACM Journal on Emerging Technologies in Computing Systems
                 (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Sitik:2015:FBL,
  author =       {Can Sitik and
                 Emre Salman and
                 Leo Filippini and
                 Sung Jun Yoon and
                 Baris Taskin},
  title =        {{FinFET}-Based Low-Swing Clocking},
  journal =      j-JETC,
  volume =       {12},
  number =       {2},
  pages =        {13:1--13:??},
  month =        aug,
  year =         {2015},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2701617},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Tue Sep 8 18:25:16 MDT 2015},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract =     {A low-swing clocking methodology is introduced to achieve
                 low-power operation at 20nm FinFET technology. Low-swing
                 clock trees are used in existing methodologies in order to
                 decrease the dynamic power consumption in a trade-off for 3
                 issues: (1) the effect of leakage power consumption, which
                 is becoming more dominant when the process scales sub-32nm;
                 (2) the increase in insertion delay, resulting in a high
                 clock skew; and (3) the difficulty in driving the existing
                 DFF sinks with a low-swing clock signal without a timing
                 violation. In this article, a FinFET-based low-swing
                 clocking methodology is introduced to preserve the dynamic
                 power savings of low-swing clocking while minimizing these
                 three negative effects, facilitated through an efficient use
                 of FinFET technology. At scaled performance constraints, the
                 proposed methodology at 20nm FinFET leads to 42\% total
                 power savings (clock network+DFF) compared to a FinFET-based
                 full-swing counterpart at the same frequency (3 GHz), thanks
                 to the dynamic power savings of low-swing clocking and 3\%
                 power savings compared to a CMOS-based low-swing
                 implementation running at the half frequency (1.5 GHz),
                 thanks to the leakage power savings of FinFET technology.},
  acknowledgement = ack-nhfb,
  articleno =    {13},
  fjournal =     {ACM Journal on Emerging Technologies in Computing Systems
                 (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Zhang:2015:DCP,
  author =       {Tiansheng Zhang and
                 Jie Meng and
                 Ayse K. Coskun},
  title =        {Dynamic Cache Pooling in {$3$D} Multicore Processors},
  journal =      j-JETC,
  volume =       {12},
  number =       {2},
  pages =        {14:1--14:??},
  month =        aug,
  year =         {2015},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2700247},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Tue Sep 8 18:25:16 MDT 2015},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract =     {Resource pooling, where multiple architectural components
                 are shared among cores, is a promising technique for
                 improving system energy efficiency and reducing total chip
                 area. 3D stacked multicore processors enable efficient
                 pooling of cache resources owing to the short interconnect
                 latency between vertically stacked layers. This article
                 first introduces a 3D multicore architecture that provides
                 poolable cache resources. We then propose a runtime
                 management policy to improve energy efficiency in 3D systems
                 by utilizing the flexible heterogeneity of cache resources.
                 Our policy dynamically allocates jobs to cores on the 3D
                 system while partitioning cache resources based on cache
                 hungriness of the jobs. We investigate the impact of the
                 proposed cache resource pooling architecture and management
                 policy in 3D systems, both with and without on-chip DRAM. We
                 evaluate the performance, energy efficiency, and thermal
                 behavior for a wide range of workloads running on 3D
                 systems. Experimental results demonstrate that the proposed
                 architecture and policy reduce system energy-delay product
                 (EDP) and energy-delay-area product (EDAP) by 18.8\% and
                 36.1\% on average, respectively, in comparison to 3D
                 processors with static cache sizes.},
  acknowledgement = ack-nhfb,
  articleno =    {14},
  fjournal =     {ACM Journal on Emerging Technologies in Computing Systems
                 (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Khasanvis:2015:LPH,
  author =       {Santosh Khasanvis and
                 K. M. Masum Habib and
                 Mostafizur Rahman and
                 Roger Lake and
                 Csaba Andras Moritz},
  title =        {Low-Power Heterogeneous Graphene Nanoribbon-{CMOS}
                 Multistate Volatile Memory Circuit},
  journal =      j-JETC,
  volume =       {12},
  number =       {2},
  pages =        {15:1--15:??},
  month =        aug,
  year =         {2015},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2700233},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Tue Sep 8 18:25:16 MDT 2015},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract =     {Graphene is an emerging nanomaterial believed to be a
                 potential candidate for post-Si nanoelectronics due to its
                 exotic properties. Recently, a new graphene nanoribbon
                 crossbar (xGNR) device was proposed which exhibits negative
                 differential resistance (NDR). In this article, a multistate
                 memory design is presented that can store multiple bits in a
                 single cell enabled by this xGNR device, called graphene
                 nanoribbon tunneling random access memory (GNTRAM). An
                 approach to increase the number of bits per cell is explored
                 alternative to physical scaling to overcome CMOS SRAM
                 limitations. A comprehensive design for quaternary GNTRAM is
                 presented as a baseline, implemented with a heterogeneous
                 integration between graphene and CMOS. Sources of leakage
                 and approaches to mitigate them are investigated. This
                 design is extensively benchmarked against 16nm CMOS SRAMs
                 and 3T DRAM. The proposed quaternary cell shows up to
                 2.27$ \times $ density benefit versus 16nm CMOS SRAMs and
                 1.8$ \times $ versus 3T DRAM. It has comparable read
                 performance and is power efficient up to 1.32$ \times $
                 during active period and 818$ \times $ during standby
                 against high-performance SRAMs. Multistate GNTRAM has the
                 potential to realize high-density low-power nanoscale
                 embedded memories. Further improvements may be possible by
                 using graphene more extensively, as graphene transistors
                 become available in the future.},
  acknowledgement = ack-nhfb,
  articleno =    {15},
  fjournal =     {ACM Journal on Emerging Technologies in Computing Systems
                 (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Kang:2015:SEU,
  author =       "Wang Kang and Yue Zhang and Zhaohao Wang and
                 Jacques-Olivier Klein and Claude Chappert and
                 Dafin{\'e} Ravelosona and Gefei Wang and Youguang Zhang
                 and Weisheng Zhao",
  title =        "Spintronics: Emerging Ultra-Low-Power Circuits and
                 Systems beyond {MOS} Technology",
  journal =      j-JETC,
  volume =       "12",
  number =       "2",
  pages =        "16:1--16:??",
  month =        aug,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2663351",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Sep 8 18:25:16 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Conventional MOS integrated circuits and systems
                 suffer severe power and scalability challenges as
                 technology nodes scale into ultra-deep-micron
                 technology nodes (e.g., below 40nm). Both static and
                 dynamic power dissipations are increasing, caused
                 mainly by the intrinsic leakage currents and large data
                 traffic. Alternative approaches beyond
                 charge-only-based electronics, and in particular,
                 spin-based devices, show promising potential to
                 overcome these issues by adding the spin freedom of
                 electrons to electronic circuits. Spintronics provides
                 data non-volatility, fast data access, and low-power
                 operation, and has now become a hot topic in both
                 academia and industry for achieving ultra-low-power
                 circuits and systems. The ITRS report on emerging
                 research devices identified the magnetic tunnel
                 junction (MTJ) nanopillar (one of the Spintronics
                 nanodevices) as one of the most promising technologies
                 to be part of future micro-electronic circuits. In this
                 review we will give an overview of the status and
                 prospects of spin-based devices and circuits that are
                 currently under intense investigation and development
                 across the world, and address particularly their merits
                 and challenges for practical applications. We will also
                 show that, with a rapid development of Spintronics,
                 some novel computing architectures and paradigms beyond
                 classic Von-Neumann architecture have recently been
                 emerging for next-generation ultra-low-power circuits
                 and systems.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Azghadi:2015:PST,
  author =       "Mostafa Rahimi Azghadi and Saber Moradi and Daniel B.
                 Fasnacht and Mehmet Sirin Ozdas and Giacomo Indiveri",
  title =        "Programmable Spike-Timing-Dependent Plasticity
                 Learning Circuits in Neuromorphic {VLSI}
                 Architectures",
  journal =      j-JETC,
  volume =       "12",
  number =       "2",
  pages =        "17:1--17:??",
  month =        aug,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2658998",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Sep 8 18:25:16 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Hardware implementations of spiking neural networks
                 offer promising solutions for computational tasks that
                 require compact and low-power computing technologies.
                 As these solutions depend on both the specific network
                 architecture and the type of learning algorithm used,
                 it is important to develop spiking neural network
                 devices that offer the possibility to reconfigure their
                 network topology and to implement different types of
                 learning mechanisms. Here we present a neuromorphic
                 multi-neuron VLSI device with on-chip programmable
                 event-based hybrid analog/digital circuits; the
                 event-based nature of the input/output signals allows
                 the use of address-event representation infrastructures
                 for configuring arbitrary network architectures, while
                 the programmable synaptic efficacy circuits allow the
                 implementation of different types of spike-based
                 learning mechanisms. The main contributions of this
                 article are to demonstrate how the programmable
                 neuromorphic system proposed can be configured to
                 implement specific spike-based synaptic plasticity
                 rules and to depict how it can be utilised in a
                 cognitive task. Specifically, we explore the
                 implementation of different spike-timing plasticity
                 learning rules online in a hybrid system comprising a
                 workstation and when the neuromorphic VLSI device is
                 interfaced to it, and we demonstrate how, after
                 training, the VLSI device can perform as a standalone
                 component (i.e., without requiring a computer), binary
                 classification of correlated patterns.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Graziano:2015:PVE,
  author =       "Mariagrazia Graziano and Azzurra Pulimeno and Ruiyu
                 Wang and Xiang Wei and Massimo Ruo Roch and Gianluca
                 Piccinini",
  title =        "Process Variability and Electrostatic Analysis of
                 Molecular {QCA}",
  journal =      j-JETC,
  volume =       "12",
  number =       "2",
  pages =        "18:1--18:??",
  month =        aug,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2738041",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Sep 8 18:25:16 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Molecular quantum-dot cellular automata (mQCA) is an
                 emerging paradigm for nanoscale computation. Its
                 revolutionary features are the expected operating
                 frequencies (THz), the high device densities, the
                 noncryogenic working temperature, and, above all, the
                 limited power densities. The main drawback of this
                 technology is a consequence of one of its very main
                 advantages, that is, the extremely small size of a
                 single molecule. Device prototyping and the fabrication
                 of a simple circuit are limited by lack of control in
                 the technological process [Pulimeno et al. 2013a].
                 Moreover, high defectivity might strongly impact the
                 correct behavior of mQCA devices. Another challenging
                 point is the lack of a solid method for analyzing and
                 simulating mQCA behavior and performance, either in
                 ideal or defective conditions. Our contribution in this
                 article is threefold: (i) We identify a methodology
                 based on both ab-initio simulations and post-processing
                 of data for analyzing an mQCA system adopting an
                 electronic point of view (we baptized this method as
                 ``MoSQuiTo''); (ii) we assess the performance of an
                 mQCA device (in this case, a bis-ferrocene molecule)
                 working in nonideal conditions, using as a reference
                 the information on fabrication-critical issues and on
                 the possible defects that we are obtaining while
                 conducting our own ongoing experiments on mQCA; (iii)
                 we determine and assess the electrostatic energy stored
                 in a bis-ferrocene molecule both in an oxidized and
                 reduced form. Results presented here consist of
                 quantitative information for an mQCA device working in
                 manifold driving conditions and subjected to defects.
                 This information is given in terms of: (a) output
                 voltage; (b) safe operating area (SOA); (c)
                 electrostatic energy; and (d) relation between SOA and
                 energy, that is, possible energy reduction subject to
                 reliability and functionality constraints. The whole
                 analysis is a first fundamental step toward the study
                 of a complex mQCA circuit. It gives important
                 suggestions on possible improvements of the
                 technological processes. Moreover, it starts an
                 interesting assessment on the energy of an mQCA, one of
                 the most promising features of this technology.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Le:2015:END,
  author =       "Trong Nhan Le and Alain Pegatoquet and Olivier Berder
                 and Olivier Sentieys and Arnaud Carer",
  title =        "Energy-Neutral Design Framework for
                 Supercapacitor-Based Autonomous Wireless Sensor
                 Networks",
  journal =      j-JETC,
  volume =       "12",
  number =       "2",
  pages =        "19:1--19:??",
  month =        aug,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2787512",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Sep 8 18:25:16 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "To design autonomous wireless sensor networks (WSNs)
                 with a theoretical infinite lifetime, energy harvesting
                 (EH) techniques have been recently considered as
                 promising approaches. Ambient sources can provide
                 everlasting additional energy for WSN nodes and exclude
                 their dependence on battery. In this article, an
                 efficient energy harvesting system which is compatible
                 with various environmental sources, such as light,
                 heat, or wind energy, is proposed. Our platform takes
                 advantage of double-level capacitors not only to
                 prolong system lifetime but also to enable robust
                 booting from the exhausting energy of the system.
                 Simulations and experiments show that our
                 multiple-energy-sources converter (MESC) can achieve
                 booting time in order of seconds. Although capacitors
                 have virtual recharge cycles, they suffer higher
                 leakage compared to rechargeable batteries. Increasing
                 their size can decrease the system performance due to
                 leakage energy. Therefore, an energy-neutral design
                 framework providing a methodology to determine the
                 minimum size of those storage devices satisfying
                 energy-neutral operation (ENO) and maximizing system
                 quality-of-service (QoS) in EH nodes, when using a
                 given energy source, is proposed. Experiments
                 validating this framework are performed on a real WSN
                 platform with both photovoltaic cells and thermal
                 generators in an indoor environment. Moreover,
                 simulations on OMNET++ show that the energy storage
                 optimized from our design framework is utilized up to
                 93.86\%.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Shi:2015:ISI,
  author =       "Yiyu Shi and Takashi Sato",
  title =        "Introduction to: Special Issue on Cross-Layer System
                 Design",
  journal =      j-JETC,
  volume =       "12",
  number =       "3",
  pages =        "20:1--20:??",
  month =        sep,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2767131",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Sep 22 17:30:11 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{De:2015:ASC,
  author =       "Vivek K. De and Andrew B. Kahng and Tanay Karnik and
                 Bao Liu and Milad Maleki and Lu Wang",
  title =        "Application-Specific Cross-Layer Optimization Based on
                 Predictive Variable-Latency {VLSI} Design",
  journal =      j-JETC,
  volume =       "12",
  number =       "3",
  pages =        "21:1--21:??",
  month =        sep,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2746341",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Sep 22 17:30:11 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Traditional synchronous VLSI design requires that all
                 computations in a logic stage complete in one clock
                 cycle. This leads to increasingly pessimistic design as
                 technology scaling introduces increasingly significant
                 parametric variations that result in an increasing
                 performance variability. Alternatively, by allowing
                 computations in a logic stage to complete in a variable
                 number of clock cycles, variable-latency design
                 provides relaxed timing constraints for average
                 performance, area, and power consumption optimization.
                 In this article, we present improved variable-latency
                 design techniques including: (1) a generic
                 minimum-intrusion variable-latency VLSI design
                 paradigm, (2) a signal probability-based approximate
                 prediction logic construction method for minimum
                 misprediction rate at minimum cost, and (3) an
                 application-specific cross-layer analysis methodology.
                 Our experiments show that the proposed variable-latency
                 design methodology on average reduces the computation
                 latency by 26.80\%(14.65\%) at cost of 0.08\%(3.4\%)
                 area and 0.4\%(2.2\%) energy consumption increase for
                 the integer (floating point) unit of an open-source
                 SPARC V8 processor LEON2 synthesized with a clock-cycle
                 time between 1.97ns(3.49ns) and 5.96ns(13.74ns) based
                 on the 45nm Nangate open cell library, while an
                 automotive application-specific design further achieves
                 an average latency reduction of 41.8\%.",
  acknowledgement = ack-nhfb,
  articleno =    "21",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Patnaik:2015:PPC,
  author =       "Milan Patnaik and Chidhambaranathan R. and Chirag Garg
                 and Arnab Roy and V. R. Devanathan and Shankar
                 Balachandran and V. Kamakoti",
  title =        "{ProWATCh}: a Proactive Cross-Layer Workload-Aware
                 Temperature Management Framework for Low-Power Chip
                 Multi-Processors",
  journal =      j-JETC,
  volume =       "12",
  number =       "3",
  pages =        "22:1--22:??",
  month =        sep,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2753762",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Sep 22 17:30:11 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "With the increase in process variations and diversity
                 in workloads, it is imperative to holistically explore
                 optimization techniques for power and temperature from
                 the circuit layer right up to the compiler/operating
                 system (OS) layer. This article proposes one such
                 holistic technique, called proactive workload aware
                 temperature management framework for low-power chip
                 multi-processors (ProWATCh). At the compiler level
                 ProWATCh includes two techniques: (1) a novel compiler
                 design for estimating the architectural parameters of a
                 task at compile time; and (2) a model-based technique
                 for dynamic estimation of architectural parameters at
                 runtime. At the OS level ProWATCh integrates two
                 techniques: (1) a workload- and temperature-aware
                 process manager for dynamic distribution of tasks to
                 different cores; and (2) a model predictive
                 control-based task scheduler for generating the
                 efficient sequence of task execution. At the circuit
                 level ProWATCh implements either of two techniques: (1)
                 a workload-aware voltage manager for dynamic supply and
                 body bias voltage assignment for a given frequency in
                 processors that support adaptive body bias (ABB); or
                 (2) a workload-aware frequency governor for efficient
                 assignment of upper and lower frequency bounds for
                 frequency scaling in processors that do not support an
                 ABB. Employing ProWATCh (with voltage manager) on an
                 ABB-compatible 3D OpenSPARC architecture using MiBench
                 benchmarks resulted in an average 18\% (19$^\circ$C) reduction
                 in peak temperature. Evaluating ProWATCh on an existing
                 quad-core Intel Corei7 processor with frequency
                 governor alone (as the processor does not support an
                 ABB interface) resulted in 10\% (8$^\circ$C) reduction in peak
                 temperature when compared to what was obtained using
                 the native Linux 3.0 completely fair scheduler (CFS).
                 To study the effectiveness of the proposed framework
                 across benchmark suites, ProWATCh was evaluated on a
                 quad-core Intel Corei7 processor using CPU SPEC 2006
                 benchmarks which resulted in 7$^\circ$C reduction in peak
                 temperature as compared to the native Linux 3.0 CFS.",
  acknowledgement = ack-nhfb,
  articleno =    "22",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Zhao:2015:STD,
  author =       "Chenyuan Zhao and Bryant T. Wysocki and Yifang Liu and
                 Clare D. Thiem and Nathan R. McDonald and Yang Yi",
  title =        "Spike-Time-Dependent Encoding for Neuromorphic
                 Processors",
  journal =      j-JETC,
  volume =       "12",
  number =       "3",
  pages =        "23:1--23:??",
  month =        sep,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2738040",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Sep 22 17:30:11 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "This article presents our research towards developing
                 novel and fundamental methodologies for data
                 representation using spike-timing-dependent encoding.
                 Time encoding efficiently maps a signal's amplitude
                 information into a spike time sequence that represents
                 the input data and offers perfect recovery for
                 band-limited stimuli. In this article, we pattern the
                 neural activities across multiple timescales and encode
                 the sensory information using time-dependent temporal
                 scales. The spike encoding methodologies for autonomous
                 classification of time-series signatures are explored
                 using near-chaotic reservoir computing. The proposed
                 spiking neuron is compact, low power, and robust. A
                 hardware implementation of these results is expected to
                 produce an agile hardware implementation of time
                 encoding as a signal conditioner for dynamical neural
                 processor designs.",
  acknowledgement = ack-nhfb,
  articleno =    "23",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Barke:2015:CLA,
  author =       "Martin Barke and Ulf Schlichtmann",
  title =        "A Cross-Layer Approach to Measure the Robustness of
                 Integrated Circuits",
  journal =      j-JETC,
  volume =       "12",
  number =       "3",
  pages =        "24:1--24:??",
  month =        sep,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2743022",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Sep 22 17:30:11 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "The demands on system robustness and its immunity
                 against perturbations are getting increasingly
                 important. Nearly everybody has an intuitive
                 understanding of what robustness means, but there is no
                 proper way how to measure robustness of integrated
                 circuits already during the design phase. Therefore, a
                 general cross-layer robustness model and methods to
                 quantitatively measure robustness are presented.
                 Moreover, these methods are refined to predict the
                 robustness against degradation of digital circuits due
                 to aging effects.",
  acknowledgement = ack-nhfb,
  articleno =    "24",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Zhuo:2015:CLA,
  author =       "Cheng Zhuo and Houle Gan and Wei-Kai Shih and Alaeddin
                 A. Aydiner",
  title =        "A Cross-Layer Approach for Early-Stage Power Grid
                 Design and Optimization",
  journal =      j-JETC,
  volume =       "12",
  number =       "3",
  pages =        "25:1--25:??",
  month =        sep,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700246",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Sep 22 17:30:11 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Power integrity has become increasingly important for
                 sub-32nm designs. Many prior works have discussed power
                 grid design and optimization in the post-layout stage,
                 when design change is inevitably expensive and
                 difficult. In contrast, during the early stage of a
                 development cycle, designers have more flexibility to
                 improve the design quality. However, there are several
                 fundamental challenges at early stage when the design
                 database is not complete, including extraction,
                 modeling, and optimization. This article tackles these
                 fundamental issues of early-stage power grid design
                 from architecture to layout. The proposed methods have
                 been silicon validated on 32nm on-market chips and
                 successfully applied to a 22nm design for its
                 early-stage power grid design. The findings from such
                 practices reveal that, for sub-32nm chips, an intrinsic
                 on-die capacitance and power gate scheme may have more
                 significant impact than expected on power integrity,
                 and needs to be well addressed at early stage.",
  acknowledgement = ack-nhfb,
  articleno =    "25",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Lee:2015:REE,
  author =       "Jinho Lee and Kyungsu Kang and Kiyoung Choi",
  title =        "{REDELF}: an Energy-Efficient Deadlock-Free Routing
                 for {$3$D} {NoCs} with Partial Vertical Connections",
  journal =      j-JETC,
  volume =       "12",
  number =       "3",
  pages =        "26:1--26:??",
  month =        sep,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2751560",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Sep 22 17:30:11 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "3D integrated circuits (3D ICs) using through-silicon
                 vias (TSVs) allow to envision the stacking of dies with
                 different functions and technologies, using as an
                 interconnect backbone a 3D network-on-chip (NoC).
                 However, partial vertical connection in 3D NoCs seems
                 unavoidable because of the large overhead of TSV itself
                 (e.g., large footprint, low fabrication yield,
                 additional fabrication processes) as well as the
                 heterogeneity in dimension. This article proposes an
                 energy-efficient deadlock-free routing algorithm for 3D
                 mesh topologies where vertical connections partially
                 exist. By introducing some rules for selecting
                 elevators (i.e., vertical links between dies), the
                 routing algorithm can eliminate the dedicated virtual
                 channel requirement. In this article, the rules
                 themselves as well as the proof of deadlock freedom are
                 given. By eliminating the virtual channels for deadlock
                 avoidance, the proposed routing algorithm reduces the
                 energy consumption by 38.9\% compared to a conventional
                 routing algorithm. When the virtual channel is used for
                 reducing the head-of-line blocking, the proposed
                 routing algorithm increases performance by up to 23.1\%
                 and 6.9\% on average.",
  acknowledgement = ack-nhfb,
  articleno =    "26",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Zoni:2015:MDP,
  author =       "Davide Zoni and William Fornaciari",
  title =        "Modeling {DVFS} and Power-Gating Actuators for
                 Cycle-Accurate {NoC}-Based Simulators",
  journal =      j-JETC,
  volume =       "12",
  number =       "3",
  pages =        "27:1--27:??",
  month =        sep,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2751561",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Sep 22 17:30:11 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Networks-on-chip (NoCs) are a widely recognized viable
                 interconnection paradigm to support the multi-core
                 revolution. One of the major design issues of multicore
                 architectures is still the power, which can no longer
                 be considered mainly due to the cores, since the NoC
                 contribution to the overall energy budget is relevant.
                 To face both static and dynamic power while balancing
                 NoC performance, different actuators have been
                 exploited in literature, mainly dynamic voltage
                 frequency scaling (DVFS) and power gating. Typically,
                 simulation-based tools are employed to explore the huge
                 design space by adopting simplified models of the
                 components. As a consequence, the majority of
                 state-of-the-art on NoC power-performance optimization
                 do not accurately consider timing and power overheads
                 of actuators, or (even worse) do not consider them at
                 all, with the risk of overestimating the benefits of
                 the proposed methodologies. This article presents a
                 simulation framework for power-performance analysis of
                 multicore architectures with specific focus on the NoC.
                 It integrates accurate power gating and DVFS models
                 encompassing also their timing and power overheads. The
                 value added of our proposal is manyfold: (i) DVFS and
                 power gating actuators are modeled starting from
                 SPICE-level simulations; (ii) such models have been
                 integrated in the simulation environment; (iii) policy
                 analysis support is plugged into the framework to
                 enable assessment of different policies; (iv) a
                 flexible GALS (globally asynchronous locally
                 synchronous) support is provided, covering both
                 handshake and FIFO re-synchronization schemas. To
                 demonstrate both the flexibility and extensibility of
                 our proposal, two simple policies exploiting the
                 modeled actuators are discussed in the article.",
  acknowledgement = ack-nhfb,
  articleno =    "27",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Chen:2015:GPF,
  author =       "Xianmin Chen and Niraj K. Jha",
  title =        "{gem5-PVT}: a Framework for {FinFET} System Simulation
                 under {PVT} Variations",
  journal =      j-JETC,
  volume =       "12",
  number =       "3",
  pages =        "28:1--28:??",
  month =        sep,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2755564",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Sep 22 17:30:11 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "FinFET has begun replacing CMOS at the 22nm technology
                 node and beyond. Compared to planar CMOS, FinFET has a
                 higher on-current and lower leakage due to its
                 double-gate structure. A FinFET-based system simulation
                 framework can be very helpful to system architects for
                 early-stage design-space exploration using this new
                 technology. However, such a simulator does not exist.
                 We fill this gap by presenting the details of one such
                 simulation framework, called gem5-PVT, that we have
                 developed. Our simulation framework combines and
                 extends existing lower-level FinFET simulators to
                 support timing, power, and thermal studies of
                 FinFET-based chip multiprocessor systems under process,
                 voltage, and temperature (PVT) variations. It uses a
                 bottom-up modeling approach based on logic/memory cell
                 libraries that have been very accurately characterized
                 using TCAD device simulation. This allows accuracy to
                 bubble up to the system level. The framework is modular
                 and automated, hence enables system designers the
                 flexibility to evaluate various system implementations.
                 It is currently targeted at the 22nm FinFET technology.
                 We report results for two case studies to demonstrate
                 its usefulness. One study shows that more than 62.1$
                 \times $ system-level leakage reduction, at the same
                 performance, is possible when using a particular FinFET
                 logic style. Another study characterizes core-to-core
                 frequency and power variations that result from
                 underlying PVT variations and compares the
                 effectiveness of variation-aware scheduling schemes.",
  acknowledgement = ack-nhfb,
  articleno =    "28",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Bahreini:2015:MMS,
  author =       "Tayebeh Bahreini and Naser Mohammadzadeh",
  title =        "An {MINLP} Model for Scheduling and Placement of
                 Quantum Circuits with a Heuristic Solution Approach",
  journal =      j-JETC,
  volume =       "12",
  number =       "3",
  pages =        "29:1--29:??",
  month =        sep,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2766452",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Sep 22 17:30:11 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Recent works on quantum physical design have pushed
                 the scheduling and placement of quantum circuit into
                 their prominent positions. In this article, a mixed
                 integer nonlinear programming model is proposed for the
                 placement and scheduling of quantum circuits in such a
                 way that latency is minimized. The proposed model
                 determines locations of gates and the sequence of
                 operations. The proposed model is proved reducible to a
                 quadratic assignment problem which is a well-known
                 NP-complete combinatorial optimization problem. Since
                 it is impossible to find the optimal solution of this
                 NP-complete problem for large quantum circuits within a
                 reasonable amount of time, a metaheuristic solution
                 method is developed for the proposed model. Some
                 experiments are conducted to evaluate the performance
                 of the developed solution approach. Experimental
                 results show that the proposed approach improves
                 average latency by about 24.09\% for the attempted
                 benchmarks.",
  acknowledgement = ack-nhfb,
  articleno =    "29",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Rahman:2015:NVR,
  author =       "Mostafizur Rahman and Santosh Khasanvis and Csaba
                 Andras Moritz",
  title =        "Nanowire Volatile {RAM} as an Alternative to {SRAM}",
  journal =      j-JETC,
  volume =       "12",
  number =       "3",
  pages =        "30:1--30:??",
  month =        sep,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2714567",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Sep 22 17:30:11 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Maintaining benefits of CMOS technology scaling is
                 becoming challenging, primarily due to increased
                 manufacturing complexities and unwanted passive power
                 dissipations. This is particularly challenging in SRAM,
                 where manufacturing precision and leakage power control
                 are critical issues. To alleviate these challenges, we
                 proposed a novel volatile memory alternative to SRAM
                 called nanowire volatile RAM (NWRAM). Due to NWRAM's
                 regular grid-based layout and innovative circuit style,
                 manufacturing complexities are reduced and, at the same
                 time, considerable benefits are attained in terms of
                 performance and leakage power reduction. In this
                 article we elaborate NWRAM's circuit aspects and
                 manufacturability, and quantify benefits at 16nm
                 technology node through simulation against
                 state-of-the-art 6T-SRAM and gridded 8T-SRAM designs.
                 Our results show that when lower bounds in design rules
                 are considered, 10T-NWRAM's read and write time are
                 1.38x and 2x faster, and the leakage power is 14x
                 better in comparison to high-performance 6T-SRAM.
                 Similarly the 10T-NWRAM achieves 1.3x and 1.9x read and
                 write performance, and 35x leakage power improvements
                 compared to high-performance 8T-SRAM. 10T-NWRAM's
                 density is comparable to 6T-SRAM and 8T-SRAM for lower
                 bounds, but exhibits higher active power in similar
                 comparisons. This article details all benchmarking
                 results and provides thorough analysis of NWRAM's
                 evaluations.",
  acknowledgement = ack-nhfb,
  articleno =    "30",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Khouzani:2016:FEP,
  author =       "Hoda Aghaei Khouzani and Yuan Xue and Chengmo Yang",
  title =        "Fully Exploiting {PCM} Write Capacity Within Near Zero
                 Cost Through Segment-Based Page Allocation",
  journal =      j-JETC,
  volume =       "12",
  number =       "4",
  pages =        "31:1--31:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2856423",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Thu Dec 1 09:26:07 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Improving the endurance of phase change memory (PCM)
                 is a fundamental issue when PCM technology is
                 considered as an alternative to main memory usage.
                 Existing wear-leveling techniques overcome this
                 challenge through constantly remapping hot virtual
                 pages, thus engendering a fair amount of extra write
                 operations to PCM and imposing considerable performance
                 and energy overhead. Our observation is that it is
                 unnecessary to fully balance the accesses to different
                 physical page frames during the execution of each
                 process. Instead, since endurance is a lifetime factor,
                 the hot virtual pages of different processes can be
                 mapped to different physical pages in the PCM.
                 Leveraging this property, we develop a wear-resistant
                 page allocation algorithm, which exploits the diverse
                 write characteristics of different program segments to
                 improve PCM write endurance within almost no extra
                 remapping cost in terms of energy and performance. The
                 results of experiments conducted based on SPEC
                 benchmarks show that the proposed technique can prolong
                 PCM lifetime by hundreds of times within nearly zero
                 searching and remapping overhead.",
  acknowledgement = ack-nhfb,
  articleno =    "31",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Layer:2016:RSP,
  author =       "Christophe Layer and Laurent Becker and Kotb Jabeur
                 and Sylvain Claireux and Bernard Dieny and Guillaume
                 Prenat and Gregory {Di Pendina} and Stephane Gros and
                 Pierre Paoli and Virgile Javerliac and Fabrice
                 Bernard-Granger and Loic Decloedt",
  title =        "Reducing System Power Consumption Using Check-Pointing
                 on Nonvolatile Embedded Magnetic Random Access
                 Memories",
  journal =      j-JETC,
  volume =       "12",
  number =       "4",
  pages =        "32:1--32:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2876507",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Thu Dec 1 09:26:07 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "The most widely used embedded memory technology,
                 static random access memory (SRAM), is heading toward
                 scaling problems in advanced technology nodes due to
                 the leakage currents caused by the quantum tunneling
                 effect. As an alternative, spin-transfer torque
                 magnetic RAM (STT-MRAM) technology shows comparable
                 performance in terms of speed and power consumption and
                 much better performance in terms of density and
                 leakage. Moreover, MRAM brings up new paradigms in
                 system design thanks to its inherent nonvolatility,
                 which allows the definition of new instant-on/off
                 policies and leakage current optimization. Based on our
                 compact model, we have developed a fully characterized
                 system-on-chip from the basic cell up to the system
                 architecture in a 40nm LP hybrid CMOS/magnetic process.
                 Through simulations, first we demonstrate that STT-MRAM
                 is a candidate for the memory part of embedded systems,
                 and second we implement a check-pointing methodology
                 based on the regular interrupt routines of a processor
                 to enable a fast power on and off functionality. Using
                 a synthetic benchmark developed in high-level
                 programming languages intended to be representative of
                 integer system performance, our method shows that
                 having MRAM instead of SRAM in an embedded design
                 brings up important energy savings. The influence of
                 the check-pointing routine on power consumption is
                 finally evaluated with regard to various shutdown and
                 restart behaviors.",
  acknowledgement = ack-nhfb,
  articleno =    "32",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Wu:2016:RCA,
  author =       "Chengwen Wu and Guangyan Zhang and Keqin Li",
  title =        "Rethinking Computer Architectures and Software Systems
                 for Phase-Change Memory",
  journal =      j-JETC,
  volume =       "12",
  number =       "4",
  pages =        "33:1--33:40",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2893186",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Thu Dec 1 09:26:07 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "With dramatic growth of data and rapid enhancement of
                 computing powers, data accesses become the bottleneck
                 restricting overall performance of a computer system.
                 Emerging phase-change memory (PCM) is byte-addressable
                 like DRAM, persistent like hard disks and Flash SSD,
                 and about four orders of magnitude faster than hard
                 disks or Flash SSDs for typical file system I/Os. The
                 maturity of PCM from research to production provides a
                 new opportunity for improving the I/O performance of a
                 system. However, PCM also has some weaknesses, for
                 example, long write latency, limited write endurance,
                 and high active energy. Existing processor cache
                 systems, main memory systems, and online storage
                 systems are unable to leverage the advantages of PCM,
                 and/or to mitigate PCM's drawbacks. The reason behind
                 this incompetence is that they are designed and
                 optimized for SRAM, DRAM memory, and hard drives,
                 respectively, instead of PCM memory. There have been
                 some efforts concentrating on rethinking computer
                 architectures and software systems for PCM. This
                 article presents a detailed survey and review of the
                 areas of computer architecture and software systems
                 that are oriented to PCM devices. First, we identify
                 key technical challenges that need to be addressed
                 before this memory technology can be leveraged, in the
                 form of processor cache, main memory, and online
                 storage, to build high-performance computer systems.
                 Second, we examine various designs of computer
                 architectures and software systems that are PCM aware.
                 Finally, we obtain several helpful observations and
                 propose a few suggestions on how to leverage PCM to
                 optimize the performance of a computer system.",
  acknowledgement = ack-nhfb,
  articleno =    "33",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Deb:2016:RSS,
  author =       "Arighna Deb and Debesh K. Das and Hafizur Rahaman and
                 Robert Wille and Rolf Drechsler and Bhargab B.
                 Bhattacharya",
  title =        "Reversible Synthesis of Symmetric Functions with a
                 Simple Regular Structure and Easy Testability",
  journal =      j-JETC,
  volume =       "12",
  number =       "4",
  pages =        "34:1--34:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2894757",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Thu Dec 1 09:26:07 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "In this article, we introduce a novel method of
                 synthesizing symmetric Boolean functions with
                 reversible logic gates. In contrast to earlier
                 approaches, the proposed technique deploys a simple,
                 regular, and cascaded structure consisting of an array
                 of Peres and CNOT gates, which results in significant
                 reduction with respect to the quantum cost. However,
                 the number of circuit inputs may increase slightly when
                 such cascades are used. In order to reduce their
                 number, we next propose a postsynthesis optimization
                 phase that allows judicious reuse of circuit lines. In
                 addition to offering a cost-effective synthesis
                 methodology, the proposed reversible logic structure
                 supports elegant testability properties. With respect
                 to all single or partial missing gate faults (SMGFs and
                 PMGFs), or repeated gate faults (RGFs) in such an
                 $n$-input circuit module, we show that it admits a
                 universal test set of constant cardinality (=3) for any
                 value of $n$. Thus, considering both the cost and
                 testability issues, this approach provides a superior
                 option for synthesizing symmetric functions compared to
                 existing designs.",
  acknowledgement = ack-nhfb,
  articleno =    "34",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Wang:2016:NPM,
  author =       "Qian Wang and Yongtae Kim and Peng Li",
  title =        "Neuromorphic Processors with Memristive Synapses:
                 Synaptic Interface and Architectural Exploration",
  journal =      j-JETC,
  volume =       "12",
  number =       "4",
  pages =        "35:1--35:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2894756",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Thu Dec 1 09:26:07 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Due to their nonvolatile nature, excellent
                 scalability, and high density, memristive nanodevices
                 provide a promising solution for low-cost on-chip
                 storage. Integrating memristor-based synaptic crossbars
                 into digital neuromorphic processors (DNPs) may
                 facilitate efficient realization of brain-inspired
                 computing. This article investigates architectural
                 design exploration of DNPs with memristive synapses by
                 proposing two synapse readout schemes. The key design
                 tradeoffs involving different analog-to-digital
                 conversions and memory accessing styles are thoroughly
                 investigated. A novel storage strategy optimized for
                 feedforward neural networks is proposed in this work,
                 which greatly reduces the energy and area cost of the
                 memristor array and its peripherals.",
  acknowledgement = ack-nhfb,
  articleno =    "35",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Biswas:2016:IFW,
  author =       "Kalyan Biswas and Angsuman Sarkar and Chandan Kumar
                 Sarkar",
  title =        "Impact of Fin Width Scaling on {RF}\slash Analog
                 Performance of Junctionless Accumulation-Mode Bulk
                 {FinFET}",
  journal =      j-JETC,
  volume =       "12",
  number =       "4",
  pages =        "36:1--36:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2903143",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Thu Dec 1 09:26:07 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "In this article, the RF and analog performance of
                 junctionless accumulation-mode bulk FinFETs is analyzed
                 by employing the variation of fin width so that it can
                 be used as a high-efficiency RF integrated circuit
                 design. The RF/analog performance evaluation has been
                 carried out using the ATLAS 3D device simulator in
                 terms of evaluation of figure-of-merits metrics such as
                 transconductance (g$_m$), gate-to-source/drain
                 capacitances (C$_{gg}$), cutoff frequency (f$_T$),
                 and maximum frequency of oscillation (f$_{max}$).
                 Apart from RF/analog performance investigation, the
                 variation of ON-current to OFF-current ratio
                 (I$_{ON}$/I$_{OFF}$) and transconductance generation
                 factor (g$_m$/I$_{ds}$) have also been carried out. From
                 this study, it is observed that smaller fin width of
                 the device improves its performance.",
  acknowledgement = ack-nhfb,
  articleno =    "36",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Chen:2016:AMS,
  author =       "Yi-Hang Chen and Jian-Yu Chen and Juinn-Dar Huang",
  title =        "Area Minimization Synthesis for Reconfigurable
                 Single-Electron Transistor Arrays with Fabrication
                 Constraints",
  journal =      j-JETC,
  volume =       "12",
  number =       "4",
  pages =        "37:1--37:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2906360",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Thu Dec 1 09:26:07 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Power dissipation has become a pressing issue of
                 concern in the designs of most electronic system as
                 fabrication processes enter even deeper submicron
                 regions. More specifically, leakage power plays a
                 dominant role in system power dissipation. An emerging
                 circuit design style, the reconfigurable
                 single-electron transistor (SET) array, has been
                 proposed for continuing Moore's Law due to its
                 ultra-low leakage power consumption. Recently, several
                 works have been proposed to address the issues related
                 to automated synthesis for the reconfigurable SET
                 array. Nevertheless, all of those existing approaches
                 consider mandatory fabrication constraints of SET array
                 merely in late synthesis stages. In this article, we
                 propose a synthesis algorithm, featuring input-variable
                 ordering and dynamic product term ordering, for area
                 minimization. The fabrication constraints are taken
                 into account at every synthesis stage of proposed flow
                 to guarantee better synthesis outcomes. We also develop
                 a simulated annealing-based postprocess to find a
                 proper phase assignment of each input variable for
                 further area reduction. Experimental results show that
                 our new methodology can achieve up to 29\% area
                 reduction as compared to existing state-of-the-art
                 techniques.",
  acknowledgement = ack-nhfb,
  articleno =    "37",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Kim:2016:CAP,
  author =       "Moon Seok Kim and William Cane-Wissing and Xueqing Li
                 and Jack Sampson and Suman Datta and Sumeet Kumar Gupta
                 and Vijaykrishnan Narayanan",
  title =        "Comparative Area and Parasitics Analysis in {FinFET}
                 and Heterojunction Vertical {TFET} Standard Cells",
  journal =      j-JETC,
  volume =       "12",
  number =       "4",
  pages =        "38:1--38:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2914790",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Thu Dec 1 09:26:07 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Vertical tunnel field-effect transistors (VTFETs) have
                 been extensively explored to overcome the scaling
                 limits and to improve on-current (I$_{ON}$) compared to
                 standard lateral device structures for the future
                 technologies. The benefits in terms of reduced
                 footprint, high I$_{ON}$ and feasibility of fabrication
                 have been demonstrated in several works. Among various
                 VTFETs, the asymmetric heterojunction vertical tunnel
                 FETs (HVTFETs) have emerged as one of the promising
                 alternatives to standard transistors for low-voltage
                 applications. However, while such device-level benefits
                 without parasitics have been widely investigated,
                 logic-gate design with parasitics and layout
                 implications are not clear. In this article, we
                 investigate and compare the layouts and parasitic
                 capacitances and resistances of HVTFETs with FinFETs.
                 Due to the vertical device structure of HVTFETs, a
                 smaller footprint is observed compared to FinFETs in
                 cells with small fan-in. However, for high fan-in
                 cells, HVTFETs exhibit area overheads due to
                 infeasibility of contact sharing in parallel and series
                 transistors. These area overheads also lead to
                 approximately 48\% higher parasitic capacitance and
                 resistance compared to FinFETs when the number of
                 parallel and series connections increases. Further, in
                 order to analyze the impact of parasitics, we modeled
                 the analytical parasitics in SPICE. The models for both
                 HVTFETs and FinFETs with parasitics were used to
                 simulate a 15-stage inverter-based ring oscillator (RO)
                 in order to compare the delay and energy. Our
                 simulation results clearly show that HVTFETs exhibit
                 less delay at a $ V_{DD} < 0.45 $V and higher energy
                 efficiency for $ V_{DD} $s in the range of 0.3V--0.7V,
                 albeit at the cost of 8\% performance degradation.",
  acknowledgement = ack-nhfb,
  articleno =    "38",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Ahsan:2016:DMQ,
  author =       "Muhammad Ahsan and Rodney {Van Meter} and Jungsang
                 Kim",
  title =        "Designing a Million-Qubit Quantum Computer Using a
                 Resource Performance Simulator",
  journal =      j-JETC,
  volume =       "12",
  number =       "4",
  pages =        "39:1--39:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2830570",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Thu Dec 1 09:26:07 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "The optimal design of a fault-tolerant quantum
                 computer involves finding an appropriate balance
                 between the burden of large-scale integration of noisy
                 components and the load of improving the reliability of
                 hardware technology. This balance can be evaluated by
                 quantitatively modeling the execution of quantum logic
                 operations on a realistic quantum hardware containing
                 limited computational resources. In this work, we
                 report a complete performance simulation software tool
                 capable of (1) searching the hardware design space by
                 varying resource architecture and technology
                 parameters, (2) synthesizing and scheduling a
                 fault-tolerant quantum algorithm within the hardware
                 constraints, (3) quantifying the performance metrics
                 such as the execution time and the failure probability
                 of the algorithm, and (4) analyzing the breakdown of
                 these metrics to highlight the performance bottlenecks
                 and visualizing resource utilization to evaluate the
                 adequacy of the chosen design. Using this tool, we
                 investigate a vast design space for implementing key
                 building blocks of Shor's algorithm to factor a
                 1,024-bit number with a baseline budget of 1.5 million
                 qubits. We show that a trapped-ion quantum computer
                 designed with twice as many qubits and one-tenth of the
                 baseline infidelity of the communication channel can
                 factor a 2,048-bit integer in less than 5 months.",
  acknowledgement = ack-nhfb,
  articleno =    "39",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% JETC 12(4), July 2016, article 40: synthesis of Hermitian quantum
%%% gates using an extended Jacobi eigenvalue approach.
@Article{Arabzadeh:2016:QLS,
  author =       "Mona Arabzadeh and Mahboobeh Houshmand and Mehdi
                 Sedighi and Morteza Saheb Zamani",
  title =        "Quantum-Logic Synthesis of {Hermitian} Gates",
  journal =      j-JETC,
  volume =       "12",
  number =       "4",
  pages =        "40:1--40:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2794263",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Thu Dec 1 09:26:07 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "In this article, the problem of synthesizing a general
                 Hermitian quantum gate into a set of primary quantum
                 gates is addressed. To this end, an extended version of
                 the Jacobi approach for calculating the eigenvalues of
                 Hermitian matrices in linear algebra is considered as
                 the basis of the proposed synthesis method. The quantum
                 circuit synthesis method derived from the Jacobi
                 approach and its optimization challenges are described.
                 It is shown that the proposed method results in
                 multiple-control rotation gates around the y axis,
                 multiple-control phase shift gates, multiple-control
                 NOT gates, and a middle diagonal Hermitian matrix,
                 which can be synthesized to multiple-control Pauli Z
                 gates. Using the proposed approach, it is shown how
                 multiple-control U gates, where U is a single-qubit
                 Hermitian quantum gate, can be implemented using a
                 linear number of elementary gates in terms of circuit
                 lines with the aid of one auxiliary qubit in an
                 arbitrary state.",
  acknowledgement = ack-nhfb,
  articleno =    "40",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% JETC 12(4), July 2016, article 41: embedding large irreversible
%%% Boolean functions into reversible functions prior to synthesis.
@Article{Soeken:2016:ELB,
  author =       "Mathias Soeken and Robert Wille and Oliver Keszocze
                 and D. Michael Miller and Rolf Drechsler",
  title =        "Embedding of Large {Boolean} Functions for Reversible
                 Logic",
  journal =      j-JETC,
  volume =       "12",
  number =       "4",
  pages =        "41:1--41:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2786982",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Thu Dec 1 09:26:07 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Reversible logic represents the basis for many
                 emerging technologies and has recently been intensively
                 studied. However, most of the Boolean functions of
                 practical interest are irreversible and must be
                 embedded into a reversible function before they can be
                 synthesized. Thus far, an optimal embedding is
                 guaranteed only for small functions, whereas a
                 significant overhead results when large functions are
                 considered. We study this issue in this article. We
                 prove that determining an optimal embedding is
                 coNP-hard already for restricted cases. Then, we
                 propose heuristic and exact methods for determining
                 both the number of additional lines and a corresponding
                 embedding. For the approaches, we considered sum of
                 products and binary decision diagrams as function
                 representations. Experimental evaluations show the
                 applicability of the approaches for large functions.
                 Consequently, the reversible embedding of large
                 functions is enabled as a precursor to subsequent
                 synthesis.",
  acknowledgement = ack-nhfb,
  articleno =    "41",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% JETC 12(4), July 2016, article 42: FinFET delay/power models under
%%% PVT variations, comparing the 22nm and 14nm technology nodes.
%%% The title writes the "/" in "Delay/Power" as \slash.
@Article{Tang:2016:DPM,
  author =       "Aoxiang Tang and Xun Gao and Lung-Yen Chen and Niraj
                 K. Jha",
  title =        "Delay\slash Power Modeling and Optimization of
                 {FinFET} Circuit Modules under {PVT} Variations:
                 Observing the Trends between the 22nm and 14nm
                 Technology Nodes",
  journal =      j-JETC,
  volume =       "12",
  number =       "4",
  pages =        "42:1--42:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2795231",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Thu Dec 1 09:26:07 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "The semiconductor industry has moved to FinFETs
                 because of their superior ability to mitigate
                 short-channel effects relative to CMOS. Thus, good
                 FinFET delay and power models are urgently needed to
                 facilitate FinFET IC design at the upcoming technology
                 nodes. Another urgent problem that needs to be
                 addressed with continued technology scaling is how to
                 analyze circuit performance and power consumption under
                 process, voltage, and temperature (PVT) variations.
                 Such variations arise due to limitations of lithography
                 that lead to variations in the physical dimensions of
                 the device or due to environmental variations. In this
                 article, we propose a delay/power modeling framework
                 for analysis of FinFET logic circuits under PVT
                 variations. We present models for FinFET logic gates
                 and three FinFET SRAM cells. We use GenFin, which is a
                 genetic algorithm based statistical circuit-level
                 delay/power optimizer, to produce the models for
                 functional units (FUs) employed in a processor. We
                 compare the impact of PVT variations at the 22nm and
                 14nm FinFET technology nodes. We evaluate cache
                 performance for various cache capacities and
                 temperatures as well as that of FUs. Our device
                 simulation results show that the $ 3 \sigma / \mu $
                 spread for 14nm circuits is, on average, 38.5\% higher
                 in dynamic power and 21.4\% higher in logarithm of
                 leakage power relative to 22nm FinFET circuits.
                 However, the delay spread depends on the circuit.",
  acknowledgement = ack-nhfb,
  articleno =    "42",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% JETC 12(4), July 2016, article 43: multiparameter asymmetric
%%% shorted-gate FinFETs for low-leakage, high-performance logic.
@Article{Chaudhuri:2016:ULL,
  author =       "Sourindra M. Chaudhuri and Niraj K. Jha",
  title =        "Ultra-Low-Leakage and High-Performance Logic Circuit
                 Design Using Multiparameter Asymmetric {FinFETs}",
  journal =      j-JETC,
  volume =       "12",
  number =       "4",
  pages =        "43:1--43:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2832913",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Thu Dec 1 09:26:07 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Recently, multigate field-effect transistors have
                 started replacing traditional planar MOSFETs to keep
                 pace with Moore's Law in deep submicron technology.
                 Among different multigate transistors, FinFETs have
                 become the preferred choice of the semiconductor
                 industry owing to low fabrication cost, superior
                 performance, lower leakage, and design flexibility. The
                 back and front gates of a FinFET can either be shorted
                 or remain independent, leading to two modes of
                 operation: Shorted-Gate (SG) and Independent-Gate (IG).
                 For a given mode of operation, the physical parameters
                 of the FinFET can either be symmetric or asymmetric in
                 nature. In this article, for the first time, we analyze
                 multiparameter asymmetric SG FinFETs and illustrate
                 their potential for implementing logic gates and
                 circuits that are both ultra-low-leakage and
                 high-performance simultaneously. We restrict this work
                 to SG devices because IG FinFETs (symmetric/asymmetric)
                 suffer from severely degraded on-current, which makes
                 them unattractive for high-performance designs. We
                 first compare head-to-head all viable single- and
                 multiparameter symmetric/asymmetric SG FinFETs. Among
                 all such FinFETs, the traditional SG (which are
                 symmetric in nature), Asymmetric Workfunction
                 Shorted-Gate (AWSG), and Asymmetric
                 Workfunction-Underlap Shorted-Gate (AWUSG) FinFETs show
                 the most promise. We characterize these devices under
                 process variations in gate length $ (L_G) $, fin
                 thickness $ (T_{SI}) $, gate-oxide thickness $ (T_{OX})
                 $, gate underlap $ (L_{UN}) $, and gate-workfunction $
                 (\Phi_G) $ as well as supply voltage $ (V_{DD}) $
                 variations, followed by a gate-level leakage/delay
                 analysis at different temperatures. Although AWSG
                 FinFETs consume very low leakage power, they do suffer
                 from performance degradation relative to SG FinFETs.
                 Similarly, our study reveals that no other
                 single-parameter asymmetric FinFET provides a good
                 combination of low-power and high-performance design.
                 We show that gates/circuits based on AWUSG FinFETs are
                 faster, yet consume much less leakage power and less
                 area than gates/circuits based on traditional SG
                 FinFETs. We observe 53.4\% (30.2\%) maximum (average)
                 reduction in total power at temperature $ T = 348 $K
                 while meeting the same delay constraint, with 14.2\%
                 (13.5\%) reduction in area for AWUSG circuits relative
                 to SG circuits. At $ T = 373 $K, we see 68.6\% (46.9\%)
                 maximum (average) reduction in total power.",
  acknowledgement = ack-nhfb,
  articleno =    "43",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% JETC 12(4), July 2016, article 44: PROTON+, a placement and routing
%%% tool for 3D optical networks-on-chip.  The first author's surname is
%%% braced as {Von Beuningen} so that BibTeX treats it as a single unit.
@Article{Beuningen:2016:PPR,
  author =       "Anja {Von Beuningen} and Luca Ramini and Davide
                 Bertozzi and Ulf Schlichtmann",
  title =        "{PROTON+}: a Placement and Routing Tool for {$3$D}
                 Optical Networks-on-Chip with a Single Optical Layer",
  journal =      j-JETC,
  volume =       "12",
  number =       "4",
  pages =        "44:1--44:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2830716",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Thu Dec 1 09:26:07 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Optical Networks-on-Chip (ONoCs) are a promising
                 technology to overcome the bottleneck of low bandwidth
                 of electronic Networks-on-Chip. Recent research
                 discusses power and performance benefits of ONoCs based
                 on their system-level design, while layout effects are
                 typically overlooked. As a consequence, laser power
                 requirements are inaccurately computed from the logic
                 scheme but do not consider the layout. In this article,
                 we propose PROTON+, a fast tool for placement and
                 routing of 3D ONoCs minimizing the total laser power.
                 Using our tool, the required laser power of the system
                 can be decreased by up to 94\% compared to a
                 state-of-the-art manually designed layout. In addition,
                 with the help of our tool, we study the physical design
                 space of ONoC topologies. For this purpose, topology
                 synthesis methods (e.g., global connectivity and
                 network partitioning) as well as different objective
                 function weights are analyzed in order to minimize the
                 maximum insertion loss and ultimately the system's
                 laser power consumption. For the first time, we study
                 optimal positions of memory controllers. A comparison
                 of our algorithm to a state-of-the-art placer for
                 electronic circuits shows the need for a different set
                 of tools custom-tailored for the particular
                 requirements of optical interconnects.",
  acknowledgement = ack-nhfb,
  articleno =    "44",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% JETC 12(4), July 2016, article 45: optimized fault-tolerant hybrid
%%% hierarchical wireless network-on-chip (WNoC) architecture.
@Article{Dehghani:2016:NAO,
  author =       "Abbas Dehghani and Kamal Jamshidi",
  title =        "A Novel Approach to Optimize Fault-Tolerant Hybrid
                 Wireless Network-on-Chip Architectures",
  journal =      j-JETC,
  volume =       "12",
  number =       "4",
  pages =        "45:1--45:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2814572",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Thu Dec 1 09:26:07 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Wireless Network-on-Chip (WNoC) architectures have
                 emerged as a promising interconnection infrastructure
                 to address the performance limitations of traditional
                 wire-based multihop NOCs. Nevertheless, the WNoC
                 systems encounter high failure rates due to problems
                 pertaining to integration and manufacturing of wireless
                 interconnection in nano-domain technology. As a result,
                 the permanent failures may lead to the formation of any
                 shape of faulty regions in the interconnection network,
                 which can break down the whole system. This issue is
                 not investigated in previous studies on WNoC
                 architectures. Our solution advocates the adoption of
                 communication structures with both node and link on
                 disjoint paths. On the other hand, the imposed costs of
                 WNoC design must be reasonable. Hence, a novel approach
                 to design an optimized fault-tolerant hybrid
                 hierarchical WNoC architecture for enhancing
                 performance as well as minimizing system costs is
                 proposed. The experimental results indicate that the
                 robustness of this newly proposed design is
                 significantly enhanced in comparison with its
                 fault-tolerant wire-based counterparts in the presence
                 of various faulty regions under both synthetic and
                 application-specific traffic patterns.",
  acknowledgement = ack-nhfb,
  articleno =    "45",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% JETC 12(4), July 2016, article 46: survey of architectural
%%% techniques for near-threshold voltage computing (NTC).
@Article{Mittal:2016:SAT,
  author =       "Sparsh Mittal",
  title =        "A Survey of Architectural Techniques for
                 Near-Threshold Computing",
  journal =      j-JETC,
  volume =       "12",
  number =       "4",
  pages =        "46:1--46:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2821510",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Thu Dec 1 09:26:07 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Energy efficiency has now become the primary obstacle
                 in scaling the performance of all classes of computing
                 systems. Low-voltage computing, specifically,
                 near-threshold voltage computing (NTC), which involves
                 operating the transistor very close to and yet above
                 its threshold voltage, holds the promise of providing
                 many-fold improvement in energy efficiency. However,
                 use of NTC also presents several challenges such as
                 increased parametric variation, failure rate, and
                 performance loss. This article surveys several recent
                 techniques that aim to offset these challenges for
                 fully leveraging the potential of NTC. By classifying
                 these techniques along several dimensions, we also
                 highlight their similarities and differences. It is
                 hoped that this article will provide insights into
                 state-of-the-art NTC techniques to researchers and
                 system designers and inspire further research in this
                 field.",
  acknowledgement = ack-nhfb,
  articleno =    "46",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% JETC 13(1), December 2016, article 1: guest editorial for the
%%% special issue on Secure and Trustworthy Computing (no abstract).
@Article{Sinanoglu:2016:GES,
  author =       "Ozgur Sinanoglu and Ramesh Karri",
  title =        "Guest Editorial Special Issue on Secure and
                 Trustworthy Computing",
  journal =      j-JETC,
  volume =       "13",
  number =       "1",
  pages =        "1:1--1:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2898433",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% JETC 13(1), December 2016, article 2: geometry- and
%%% resistance-based MRAM physically unclonable functions (PUFs).
@Article{Das:2016:MPU,
  author =       "Jayita Das and Kevin Scott and Sanjukta Bhanja",
  title =        "{MRAM PUF}: Using Geometric and Resistive Variations
                 in {MRAM} Cells",
  journal =      j-JETC,
  volume =       "13",
  number =       "1",
  pages =        "2:1--2:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2854154",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "In this work, we have studied two novel techniques to
                 enhance the performance of existing geometry-based
                 magnetoresistive RAM physically unclonable function
                 (MRAM PUF). Geometry-based MRAM PUFs rely only on
                 geometric variations in MRAM cells that generate
                 preferred ground state in cells and form the basis of
                 digital signature generation. Here we study two novel
                 ways to improve the performance of the geometry-based
                 PUF signature. First, we study how the choice between
                 specific geometries can enhance the reliability of the
                 digital signature. Using fabrications and simulations,
                 we study how the rectangular shape in the PUF cells is
                 more susceptible to lithography-based geometric
                 variations than the elliptical shape of the same aspect
                 ratio. The choice of rectangular over elliptical masks
                 in the lithography process can therefore improve the
                 reliability of the digital signature from PUF. Second,
                 we present a MRAM PUF architecture and study how
                 resistances in MRAM cells can be used to generate
                 analog voltage output that are easier to detect if
                 probed by an adversary. In the new PUF architecture, we
                 have the choice between selection of rows and columns
                 to generate unique and hard-to-predict analog voltage
                 outputs. For a 64-bit response, the analog voltage
                 output can range between 20 and 500 mV, making it tough
                 for an adversary to guess over this wide range of
                 voltages. This work ends with a discussion on the
                 threat resilience ability of the new improved MRAM PUF
                 to attacks from probing-, tampering-, reuse-, and
                 simulation-based models.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% JETC 13(1), December 2016, article 3: hardware-security circuit
%%% primitives built from silicon nanowire FETs and graphene SymFETs.
@Article{Bi:2016:ETB,
  author =       "Yu Bi and Kaveh Shamsi and Jiann-Shiun Yuan and
                 Pierre-Emmanuel Gaillardon and Giovanni {De Micheli}
                 and Xunzhao Yin and X. Sharon Hu and Michael Niemier
                 and Yier Jin",
  title =        "Emerging Technology-Based Design of Primitives for
                 Hardware Security",
  journal =      j-JETC,
  volume =       "13",
  number =       "1",
  pages =        "3:1--3:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2816818",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Hardware security concerns such as intellectual
                 property (IP) piracy and hardware Trojans have
                 triggered research into circuit protection and
                 malicious logic detection from various design
                 perspectives. In this article, emerging technologies
                 are investigated by leveraging their unique properties
                 for applications in the hardware security domain.
                 Security, for the first time, will be treated as one
                 design metric for emerging nano-architecture. Five
                 example circuit structures including camouflaging
                 gates, polymorphic gates, current/voltage-based circuit
                 protectors, and current-based XOR logic are designed to
                 show the high efficiency of silicon nanowire FETs and
                 graphene SymFET in applications such as circuit
                 protection and IP piracy prevention. Simulation results
                 indicate that highly efficient and secure circuit
                 structures can be achieved via the use of non-CMOS
                 devices.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% JETC 13(1), December 2016, article 4: spintronic PUFs based on
%%% domain wall memory (relay-PUF and memory-PUF designs).
@Article{Iyengar:2016:SPS,
  author =       "Anirudh Iyengar and Swaroop Ghosh and Kenneth Ramclam
                 and Jae-Won Jang and Cheng-Wei Lin",
  title =        "Spintronic {PUFs} for Security, Trust, and
                 Authentication",
  journal =      j-JETC,
  volume =       "13",
  number =       "1",
  pages =        "4:1--4:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2809781",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "We propose spintronic physically unclonable functions
                 (PUFs) to exploit security-specific properties of
                 domain wall memory (DWM) for security, trust, and
                 authentication. We note that the nonlinear dynamics of
                 domain walls (DWs) in the physical magnetic system is
                 an untapped source of entropy that can be leveraged for
                 hardware security. The spatial and temporal randomness
                 in the physical system is employed in conjunction with
                 microscopic and macroscopic properties such as
                 stochastic DW motion, stochastic pinning/depinning, and
                 serial access to realize novel relay-PUF and memory-PUF
                 designs. The proposed PUFs show promising results ($
                 \approx $50\% interdie Hamming distance (HD) and 10\%
                 to 20\% intradie HD) in terms of randomness, stability,
                 and resistance to attacks. We have investigated
                 noninvasive attacks, such as machine learning and
                 magnetic field attack, and have assessed the PUFs
                 resilience.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% JETC 13(1), December 2016, article 5: PUF design exploiting
%%% resistance variability of STT-MRAM magnetic tunnel junctions.
%%% The bibsource field also lists cryptography2010.bib.
@Article{Vatajelu:2016:SMB,
  author =       "Elena Ioana Vatajelu and Giorgio {Di Natale} and Mario
                 Barbareschi and Lionel Torres and Marco Indaco and
                 Paolo Prinetto",
  title =        "{STT--MRAM}-Based {PUF} Architecture Exploiting
                 Magnetic Tunnel Junction Fabrication-Induced
                 Variability",
  journal =      j-JETC,
  volume =       "13",
  number =       "1",
  pages =        "5:1--5:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2790302",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Physically Unclonable Functions (PUFs) are emerging
                 cryptographic primitives used to implement low-cost
                 device authentication and secure secret key generation.
                 Weak PUF's (i.e., devices able to generate a single
                 signature or to deal with a limited number of
                 challenges) are widely discussed in literature. One of
                 the most investigated solutions today is based on
                 SRAMs. However, the rapid development of low-power,
                 high-density, high-performance SoCs has pushed the
                 embedded memories to their limits and opened the field
                 to the development of emerging memory technologies. The
                 Spin-Transfer-Torque Magnetic Random Access Memory
                 (STT-MRAM) has emerged as a promising choice for
                 embedded memories due to its reduced read/write latency
                 and high CMOS integration capability. In this article,
                 we propose an innovative PUF design based on STT-MRAM
                 memory. We exploit the high variability affecting the
                 electrical resistance of the Magnetic Tunnel Junction
                 (MTJ) device in anti-parallel magnetization. We will
                 demonstrate that the proposed solution is robust,
                 unclonable, and unpredictable.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% JETC 13(1), December 2016, article 6: survey of reverse engineering
%%% (RE) and anti-RE techniques at chip, board, and system levels.
@Article{Quadir:2016:SCS,
  author =       "Shahed E. Quadir and Junlin Chen and Domenic Forte and
                 Navid Asadizanjani and Sina Shahbazmohamadi and Lei
                 Wang and John Chandy and Mark Tehranipoor",
  title =        "A Survey on Chip to System Reverse Engineering",
  journal =      j-JETC,
  volume =       "13",
  number =       "1",
  pages =        "6:1--6:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2755563",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "The reverse engineering (RE) of electronic chips and
                 systems can be used with honest and dishonest
                 intentions. To inhibit RE for those with dishonest
                 intentions (e.g., piracy and counterfeiting), it is
                 important that the community is aware of the
                 state-of-the-art capabilities available to attackers
                 today. In this article, we will be presenting a survey
                 of RE and anti-RE techniques on the chip, board, and
                 system levels. We also highlight the current challenges
                 and limitations of anti-RE and the research needed to
                 overcome them. This survey should be of interest to
                 both governmental and industrial bodies whose critical
                 systems and intellectual property (IP) require
                 protection from foreign enemies and counterfeiters who
                 possess advanced RE capabilities.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Castro:2016:FVB,
  author =       "Stephan {De Castro} and Jean-Max Dutertre and Bruno
                 Rouzeyre and Giorgio {Di Natale} and Marie-Lise
                 Flottes",
  title =        "Frontside Versus Backside Laser Injection: a
                 Comparative Study",
  journal =      j-JETC,
  volume =       "13",
  number =       "1",
  pages =        "7:1--7:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2845999",
  ISSN =         "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "The development of cryptographic devices was followed
                 by the development of so-called implementation attacks,
                 which are intended to retrieve secret information
                 exploiting the hardware itself. Among these attacks,
                 fault attacks can be used to disturb the circuit while
                 performing a computation to retrieve the secret. Among
                 possible means of injecting a fault, laser beams have
                 proven to be accurate and powerful. The laser can be
                 used to illuminate the circuit either from its
                 frontside (i.e., where metal interconnections are first
                 encountered) or from the backside (i.e., through the
                 substrate). Historically, frontside injection was
                 preferred because it does not require the die to be
                 thinned. Nevertheless, due to the increasing
                 integration of metal layers in modern technologies,
                 frontside injections do not allow targeting of any
                 desired location. Indeed, metal lines act as mirrors,
                 and they reflect and refract most of the energy
                 provided by the laser beam. Conversely, backside
                 injections, although more difficult to set up, allow an
                 increase of the resolution of the target location and
                 remove the drawbacks of the frontside technique. This
                 article compares experimental results from frontside
                 and backside fault injections. The effectiveness of the
                 two techniques is measured in terms of exploitable
                 errors on an AES circuit (i.e., errors that can be used
                 to extract the value of the secret key used during the
                 encryption process). We will show, conversely to what
                 is generally assumed, that frontside injection can
                 provide even better results compared to backside
                 injection, especially for low-cost beams with a large
                 laser spot.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Barenghi:2016:FBS,
  author =       "Alessandro Barenghi and Guido M. Bertoni and Luca
                 Breveglieri and Gerardo Pelosi and Stefano Sanfilippo
                 and Ruggero Susella",
  title =        "A Fault-Based Secret Key Retrieval Method for {ECDSA}:
                 Analysis and Countermeasure",
  journal =      j-JETC,
  volume =       "13",
  number =       "1",
  pages =        "8:1--8:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2767132",
  ISSN =         "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Elliptic curve cryptosystems proved to be well suited
                 for securing systems with constrained resources like
                 embedded and portable devices. In a fault-based attack,
                 errors are induced during the computation of a
                 cryptographic primitive, and the results are collected
                 to derive information about the secret key safely
                 stored in the device. We introduce a novel attack
                 methodology to recover the secret key employed in
                 implementations of the Elliptic Curve Digital Signature
                 Algorithm. Our attack exploits the information leakage
                 induced when altering the execution of the modular
                 arithmetic operations used in the signature primitive
                 and does not rely on the underlying elliptic curve
                 mathematical structure, thus being applicable to all
                 standardized curves. We provide both a validation of
                 the feasibility of the attack, even employing common
                 off-the-shelf hardware to perform the required
                 computations, and a low-cost countermeasure to
                 counteract it.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Lao:2016:BFD,
  author =       "Yingjie Lao and Qianying Tang and Chris H. Kim and
                 Keshab K. Parhi",
  title =        "Beat Frequency Detector-Based High-Speed True Random
                 Number Generators: Statistical Modeling and Analysis",
  journal =      j-JETC,
  volume =       "13",
  number =       "1",
  pages =        "9:1--9:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2866574",
  ISSN =         "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib;
                 https://www.math.utah.edu/pub/tex/bib/prng.bib",
  abstract =     "True random number generators (TRNGs) are crucial
                 components for the security of cryptographic systems.
                 In contrast to pseudo-random number generators
                 (PRNGs), TRNGs provide higher security by extracting
                 randomness from physical phenomena. To evaluate a TRNG,
                 statistical properties of the circuit model and raw
                 bitstream should be studied. In this article, a model
                 for the beat frequency detector--based high-speed TRNG
                 (BFD-TRNG) is proposed. The parameters of the model are
                 extracted from the experimental data of a test chip. A
                 statistical analysis of the proposed model is carried
                 out to derive mean and variance of the counter values
                 of the TRNG. Our statistical analysis results show that
                 mean of the counter values is inversely proportional to
                 the frequency difference of the two ring oscillators
                 (ROSCs), whereas the dynamic range of the counter
                 values increases linearly with standard deviation of
                 environmental noise and decreases with increase of the
                 frequency difference. Without the measurements from the
                 test data, a model cannot be created; similarly,
                 without a model, performance of a TRNG cannot be
                 predicted. The key contribution of the proposed
                 approach lies in fitting the model to measured data and
                 the ability to use the model to predict performance of
                 BFD-TRNGs that have not been fabricated. Several novel
                 alternate BFD-TRNG architectures are also proposed;
                 these include parallel BFD, cascade BFD, and
                 parallel-cascade BFD. These TRNGs are analyzed using
                 the proposed model, and it is shown that the parallel
                 BFD structure requires less area per bit, whereas the
                 cascade BFD structure has a larger dynamic range while
                 maintaining the same mean of the counter values as the
                 original BFD-TRNG. It is shown that $3.25 M$ and $4 M$
                 random bits can be obtained per counter value from
                 parallel BFD and parallel-cascade BFD, respectively,
                 where $M$ counter values are computed in parallel.
                 Furthermore, the statistical analysis results
                 illustrate that BFD-TRNGs have better randomness and
                 less cost per bit than other existing ROSC-TRNG
                 designs. For example, it is shown that BFD-TRNGs
                 accumulate 150\% more jitter than the original
                 two-oscillator TRNG and that parallel BFD-TRNGs require
                 one-third power and one-half area for same number of
                 random bits for a specified period.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Kulkarni:2016:RTA,
  author =       "Amey Kulkarni and Youngok Pino and Matthew French and
                 Tinoosh Mohsenin",
  title =        "Real-Time Anomaly Detection Framework for Many-Core
                 Router through Machine-Learning Techniques",
  journal =      j-JETC,
  volume =       "13",
  number =       "1",
  pages =        "10:1--10:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2827699",
  ISSN =         "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "In this article, we propose a real-time anomaly
                 detection framework for an NoC-based many-core
                 architecture. We assume that processing cores and
                 memories are safe and anomaly is included through a
                 communication medium (i.e., router). The article
                 targets three different attacks, namely, traffic
                 diversion, route looping, and core address spoofing
                 attacks. The attacks are detected by using
                 machine-learning techniques. Comprehensive analysis on
                 machine-learning algorithms suggests that Support
                 Vector Machine (SVM) and K-Nearest Neighbor (K-NN) have
                 better attack detection efficiency. It has been
                 observed that both algorithms have accuracy in the
                 range of 94\% to 97\%. Additional hardware complexity
                 analysis advocates SVM to be implemented on hardware.
                 To test the framework, we implement a condition-based
                 attack insertion module; attacks are performed intra-
                 and intercluster. The proposed real-time anomaly
                 detection framework is fully placed and routed on
                 Xilinx Virtex-7 FPGA. Postplace and -route
                 implementation results show that SVM has 12\% to 2\%
                 area overhead and 3\% to 1\% power overhead for the
                 quad-core and 16-core implementation, respectively. It
                 is also observed that it takes 25\% to 18\% of the
                 total execution time to detect an anomaly in
                 transferred packets for quad-core and 16-core,
                 respectively. The proposed framework achieves 65\%
                 reduction in area overhead and is 3 times faster
                 compared to previous published work.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Deb:2016:GVS,
  author =       "Arighna Deb and Robert Wille and Oliver Kesz{\"o}cze
                 and Stefan Hillmich and Rolf Drechsler",
  title =        "Gates vs. Splitters: Contradictory Optimization
                 Objectives in the Synthesis of Optical Circuits",
  journal =      j-JETC,
  volume =       "13",
  number =       "1",
  pages =        "11:1--11:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2904445",
  ISSN =         "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Optical circuits are considered a promising emerging
                 technology for applications in ultra-high-speed
                 networks or interconnects. However, the development of
                 (automatic) synthesis approaches for such circuits is
                 still in its infancy. Although first generic and
                 automatic synthesis approaches have been proposed, no
                 clear understanding exists yet on how to keep the costs
                 of the resulting circuits as small as possible. In the
                 domain of optical circuits, this is particularly
                 interesting for the number of gates and the effect of
                 so-called splitters to the signal strength. In this
                 work, we investigate this relation by considering a
                 variety of (existing as well as proposed) synthesis
                 approaches for optical circuits. Our investigations
                 show that reducing the number of gates and reducing the
                 number of splitters are contradictory optimization
                 objectives. Furthermore, the performance of synthesis
                 guided with respect to gate efficiency as well as
                 synthesis guided with respect to splitter freeness is
                 evaluated and an overhead factor between the
                 contradictory metrics is experimentally determined.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Todri-Sanial:2017:GES,
  author =       "Aida Todri-Sanial and Saraju P. Mohanty and Mariane
                 Comte and Marc Belleville",
  title =        "Guest Editorial: Special Issue on Nanoelectronic
                 Circuit and System Design Methods for the Mobile
                 Computing Era",
  journal =      j-JETC,
  volume =       "13",
  number =       "2",
  pages =        "12:1--12:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3003370",
  ISSN =         "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Sartor:2017:EIH,
  author =       "Anderson L. Sartor and Arthur F. Lorenzon and Luigi
                 Carro and Fernanda Kastensmidt and Stephan Wong and
                 Antonio C. S. Beck",
  title =        "Exploiting Idle Hardware to Provide Low Overhead Fault
                 Tolerance for {VLIW} Processors",
  journal =      j-JETC,
  volume =       "13",
  number =       "2",
  pages =        "13:1--13:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3001935",
  ISSN =         "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Because of technology scaling, the soft error rate has
                 been increasing in digital circuits, which affects
                 system reliability. Therefore, modern processors,
                 including VLIW architectures, must have means to
                 mitigate such effects to guarantee reliable computing.
                 In this scenario, our work proposes three low overhead
                 fault tolerance approaches based on instruction
                 duplication with zero latency detection, which uses a
                 rollback mechanism to correct soft errors in the
                 pipelanes of a configurable VLIW processor. The first
                 uses idle issue slots within a period of time to
                 execute extra instructions considering distinct
                 application phases. The second works at a finer grain,
                 adaptively exploiting idle functional units at
                 run-time. However, some applications present high
                 instruction-level parallelism (ILP), so the ability to
                 provide fault tolerance is reduced: less functional
                 units will be idle, decreasing the number of potential
                 duplicated instructions. The third approach attacks
                 this issue by dynamically reducing ILP according to a
                 configurable threshold, increasing fault tolerance at
                 the cost of performance. While the first two approaches
                 achieve significant fault coverage with minimal area
                 and power overhead for applications with low ILP, the
                 latter improves fault tolerance with low performance
                 degradation. All approaches are evaluated considering
                 area, performance, power dissipation, and error
                 coverage.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Fang:2017:SPM,
  author =       "Yan Fang and Victor V. Yashin and Brandon B. Jennings
                 and Donald M. Chiarulli and Steven P. Levitan",
  title =        "A Simplified Phase Model for Simulation of
                 Oscillator-Based Computing Systems",
  journal =      j-JETC,
  volume =       "13",
  number =       "2",
  pages =        "14:1--14:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2976743",
  ISSN =         "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Building oscillator-based computing systems with
                 emerging nano-device technologies has become a
                 promising solution for unconventional computing tasks
                 like computer vision and pattern recognition. However,
                 simulation and analysis of these computing systems is
                 both time and compute intensive due to the nonlinearity
                 of new devices and the complex behavior of coupled
                 oscillators. In order to speed up the simulation of
                 coupled oscillator systems, we propose a simplified
                 phase model to perform phase and frequency
                 synchronization prediction based on a synthesis of
                 earlier models. Our model can predict the
                 frequency-locking behavior with several orders of
                 magnitude speedup compared to direct evaluation,
                 enabling the effective and efficient simulation of the
                 large numbers of oscillators required for practical
                 computing systems. We demonstrate the oscillator-based
                 computing paradigm with three applications, pattern
                 matching, convolution, and image segmentation. The
                 simulation with these models are respectively sped up
                 by factors of 780, 300, and 1120 in our tests.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Singhvi:2017:FGU,
  author =       "Ajay Singhvi and Matheus T. Moreira and Ramy N. Tadros
                 and Ney L. V. Calazans and Peter A. Beerel",
  title =        "A Fine-Grain, Uniform, Energy-Efficient Delay Element
                 for $2$-Phase Bundled-Data Circuits",
  journal =      j-JETC,
  volume =       "13",
  number =       "2",
  pages =        "15:1--15:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2948067",
  ISSN =         "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Contemporary digitally controlled delay elements (DEs)
                 trade off power overheads and delay quantization error
                 (DQE). This article proposes a new programmable DE that
                 provides a balanced design that yields low power with
                 moderate DQE even under process, voltage, and
                 temperature variations. The element employs and
                 leverages the advantages offered by a 28nm fully
                 depleted silicon on insulator technology, using back
                 body biasing to add an extra dimension to its
                 programmability. To do so, a novel generic delay shift
                 block is proposed, which enables incorporating both
                 fine and coarse delays in a single DE that can be
                 easily integrated into digital systems, which is an
                 advantage over hybrid DEs that rely on analog design.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Mohammadi:2017:FTR,
  author =       "Hassan Ghasemzadeh Mohammadi and Pierre-Emmanuel
                 Gaillardon and Jian Zhang and Giovanni {De Micheli} and
                 Ernesto Sanchez and Matteo Sonza Reorda",
  title =        "A Fault-Tolerant Ripple-Carry Adder with
                 Controllable-Polarity Transistors",
  journal =      j-JETC,
  volume =       "13",
  number =       "2",
  pages =        "16:1--16:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2988234",
  ISSN =         "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "This article first explores the effects of faults on
                 circuits implemented with controllable-polarity
                 transistors. We propose a new fault model that suits
                 the characteristics of these devices, and we report the
                 results of a SPICE-based analysis of the effects of
                 faults on the behavior of some basic gates implemented
                 with them. Hence, we show that the considered devices
                 are able to intrinsically tolerate a rather high number
                 of faults. We finally exploit this property to build a
                 robust and scalable adder whose area, performance, and
                 leakage power characteristics are improved by 15\%,
                 18\%, and 12\%, respectively, when compared to an
                 equivalent FinFET solution at 22nm technology node.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Senni:2017:NVP,
  author =       "Sophiane Senni and Lionel Torres and Gilles Sassatelli
                 and Abdoulaye Gamati{\'e}",
  title =        "Non-Volatile Processor Based on {MRAM} for
                 Ultra-Low-Power {IoT} Devices",
  journal =      j-JETC,
  volume =       "13",
  number =       "2",
  pages =        "17:1--17:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3001936",
  ISSN =         "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Over the past few years, a new era of smart connected
                 devices has emerged in the market to enable the future
                 world of the Internet of Things (IoT). A key
                 requirement for IoT applications is the power
                 consumption to allow very high autonomy in the case of
                 battery-powered systems. Depending on the application,
                 such devices will be most of the time in a low-power
                 mode (sleep mode) and will wake up only when there is a
                 task to accomplish (active mode). Emerging non-volatile
                 memory technologies are seen as a very attractive
                 solution to design ultra-low-power systems. Among these
                 technologies, magnetic random access memory is a
                 promising candidate, as it combines non-volatility,
                 high density, reasonable latency, and low leakage.
                 Integration of non-volatility as a new feature of
                 memories has the great potential to allow full data
                 retention after a complete shutdown with a fast wake-up
                 time. This article explores the benefits of having a
                 non-volatile processor to enable ultra-low-power IoT
                 devices.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Rakshit:2017:MTS,
  author =       "Joydeep Rakshit and Kartik Mohanram and Runlai Wan and
                 Kai Tak Lam and Jing Guo",
  title =        "Monolayer Transistor {SRAMs}: Toward Low-Power, Denser
                 Memory Systems",
  journal =      j-JETC,
  volume =       "13",
  number =       "2",
  pages =        "18:1--18:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2967613",
  ISSN =         "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Monolayer heterojunction FETs based on vertical
                 heterogeneous transition metal dichalcogenides
                 (TMDCFETs) and planar black phosphorus FETs (BPFETs)
                 have demonstrated excellent subthreshold swing, high
                 I$_{ON}$/I$_{OFF}$, and high scalability, making them
                 attractive candidates for post-CMOS memory design. This
                 article explores TMDCFET and BPFET SRAM design by
                 combining atomistic self-consistent device modeling
                 with SRAM circuit design and simulation. We perform
                 detailed evaluations of the TMDCFET/BPFET SRAMs at a
                 single bitcell and at SRAM array level. Our simulations
                 show that at low operating voltages, TMDCFET/BPFET
                 SRAMs exhibit significant advantages in static power,
                 dynamic read/write noise margin, and read/write delay
                 over nominal 16nm CMOS SRAMs at both bitcell and
                 array-level implementations. We also analyze the effect
                 of process variations on the performance of
                 TMDCFET/BPFET SRAMs. Our simulations demonstrate that
                 TMDCFET/BPFET SRAMs exhibit high tolerance to process
                 variations, which is desirable for low operating
                 voltages.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Wang:2017:ACP,
  author =       "Xuan Wang and Jiang Xu and Zhe Wang and Haoran Li and
                 Zhehui Wang and Peng Yang and Luan H. K. Duong and
                 Rafael K. V. Maeda and Zhifei Wang",
  title =        "Alleviate Chip Pin Constraint for Multicore Processor
                 by On\slash Off-Chip Power Delivery System Codesign",
  journal =      j-JETC,
  volume =       "13",
  number =       "2",
  pages =        "19:1--19:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2914791",
  ISSN =         "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "The number of chip pins is limited due to the cost and
                 reliability issues of sophisticated packages, and it is
                 predicted that the chip pin count will be overstretched
                 to satisfy the requirements of both power delivery and
                 memory access. The gap between the achievable pin count
                 and the demand will increase as the technology scales,
                 due to the increasing computation resources and supply
                 current. Pin reduction techniques are thus required for
                 continued computing performance growth. In this
                 article, we propose a chip pin constraint alleviation
                 strategy, through on/off-chip power delivery system
                 co-design, to effectively reduce the demand for power
                 pins. An analytical model of a power delivery system,
                 consisting of on/off-chip regulators and a power
                 delivery network, is proposed to evaluate the influence
                 of regulator design and package conduction loss. By
                 combining this model with a multi-core processor model
                 of performance and memory bandwidth requirements, we
                 characterize the entire multi-core processor system to
                 investigate the relationship between the chip pin
                 constraint and performance in multi-core processor
                 scaling and the effectiveness of our strategy.
                 Experiments show that with the conventional power
                 delivery system design, the chip pin constraint
                 severely limits the performance growth as the
                 technology scales. Using the on/off-chip power delivery
                 system co-design, our strategy achieves a significant
                 pin count reduction, for example, 31.3\% at the 8nm
                 technology node, compared to the conventional design
                 with the same chip performance, while, provided with
                 the same chip pin count, it is able to improve, by
                 35.0\%, the chip performance at 8nm compared to the
                 conventional design. For real applications of different
                 parallelism, our strategy outperforms its counterpart,
                 with a 23.7\% performance improvement on average at the
                 8nm technology node.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Pajouhi:2017:YAE,
  author =       "Zoha Pajouhi and Xuanyao Fong and Anand Raghunathan
                 and Kaushik Roy",
  title =        "Yield, Area, and Energy Optimization in {STT--MRAMs}
                 Using Failure-Aware {ECC}",
  journal =      j-JETC,
  volume =       "13",
  number =       "2",
  pages =        "20:1--20:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2934685",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Spin-Transfer Torque MRAMs are attractive due to their
                 non-volatility, high density, and zero leakage.
                 However, STT-MRAMs suffer from poor reliability due to
                 shared read and write paths. Additionally, conflicting
                 requirements for data retention and writeability (both
                 related to the energy barrier height of the storage
                 device) makes design more challenging. Furthermore, the
                 energy barrier height depends on the geometry of the
                 storage. Any variations in the geometry of the storage
                 device lead to variations in the energy barrier height.
                 In order to address the poor reliability of STT-MRAMs,
                 usage of Error Correcting Codes (ECC) has been
                 proposed. Unlike traditional CMOS memory technologies,
                 ECC is expected to correct both soft and hard errors in
                 STT-MRAMs. To achieve acceptable yield with low write
                 power, stronger ECC is required, resulting in increased
                 number of encoded bits and degraded memory capacity. In
                 this article, we propose Failure-aware ECC (FaECC),
                 which masks permanent faults while maintaining the same
                 correction capability for soft errors without increased
                 number of encoded bits. Furthermore, we investigate the
                 impact of process variations on run-time reliability of
                 STT-MRAMs. In order to analyze the effectiveness of our
                 methodology, we developed a cross-layer simulation
                 framework that consists of device, circuit and array
                 level analysis of STT-MRAM memory arrays. Our results
                 show that using FaECC relaxes the requirements on the
                 energy barrier height, which reduces the write energy
                 and results in smaller access transistor size and
                 memory array area.",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Mankalale:2017:OSC,
  author =       "Meghna G. Mankalale and Sachin S. Sapatnekar",
  title =        "Optimized Standard Cells for All-Spin Logic",
  journal =      j-JETC,
  volume =       "13",
  number =       "2",
  pages =        "21:1--21:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2967612",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "All-Spin Logic (ASL) devices provide a promising
                 spintronics-based alternative for Boolean logic
                 implementations in the post-Complementary Metal-Oxide
                 Semiconductor (CMOS) era. In principle, any logic
                 functionality can be implemented in ASL. In practice,
                 the performance of an ASL gate is significantly
                 affected by layout choices, but such implications have
                 not been adequately explored in the past. This article
                 proposes a systematic approach for building standard
                 cells in ASL, which are a basic building block in an
                 overall design methodology for implementing large
                 ASL-based circuits. We first propose a new technique to
                 reduce the magnet count for an ASL majority gate but
                 still ensure correct functioning through layout
                 optimization methods. Building on physics-based
                 analysis, we then build a standard cell library with
                 diverse functionality and characterize the library for
                 delay, energy, and area. We perform delay-optimized
                 technology mapping on ISCAS85 benchmark circuits using
                 our library. Our approach results in circuits that are
                 12.90\% faster, consume 26.16\% less energy, and are
                 33.56\% more area efficient compared to a standard cell
                 library that does not incorporate layout-based
                 optimization techniques of our work.",
  acknowledgement = ack-nhfb,
  articleno =    "21",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Jiang:2017:SLD,
  author =       "Wei Jiang and Liang Wen and Ke Jiang and Xia Zhang and
                 Xiong Pan and Keran Zhou",
  title =        "System-Level Design to Detect Fault Injection Attacks
                 on Embedded Real-Time Applications",
  journal =      j-JETC,
  volume =       "13",
  number =       "2",
  pages =        "22:1--22:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2967611",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Fault injection attack has been a serious threat to
                 security-critical embedded systems for a long time, yet
                 existing research ignores addressing of the problem
                 from a system-level perspective. This article presents
                 an approach to the synthesis of secure real-time
                 applications mapped on distributed embedded systems,
                 which focuses on preventing fault injection attacks of
                 the security protection on processing units. We utilize
                 symmetric cryptographic service to protect
                 confidentiality and deploy fault detection within a
                 confidential algorithm to resist fault injection
                 attacks. Several fault detection schemes are
                 identified, and their fault coverage rates and time
                 overheads are derived and measured. Our synthesis
                 approach makes efforts to determine the best fault
                 detection schemes for the encryption/decryption of
                 messages such that the overall security strength of
                 detecting a fault injection attack is maximized and the
                 deadline constraint of the real-time applications is
                 guaranteed. Due to the complexity of the problem, we
                 propose an efficient algorithm based on the fruit fly
                 optimization algorithm, and we compare it to the
                 simulated annealing approach. Extensive experiments and
                 a real-life application evaluation demonstrate the
                 superiority of our approach.",
  acknowledgement = ack-nhfb,
  articleno =    "22",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Goud:2017:AUF,
  author =       "A. Arun Goud and Rangharajan Venkatesan and Anand
                 Raghunathan and Kaushik Roy",
  title =        "Asymmetric Underlapped {FinFETs} for Near- and
                 Super-Threshold Logic at Sub-10nm Technology Nodes",
  journal =      j-JETC,
  volume =       "13",
  number =       "2",
  pages =        "23:1--23:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2967615",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Extending double-gate FinFET scaling to sub-10nm
                 technology regime requires device-engineering
                 techniques for countering the rise of direct source to
                 drain tunneling (DSDT), edge direct tunneling (EDT) and
                 short channel effects (SCE) that degrade FinFET I-V
                 characteristics. Symmetric underlap is effective for
                 eliminating EDT, diminishing DSDT, and lowering the
                 fringe component of gate capacitance. However,
                 excessive symmetric underlap also lowers the
                 on-current, which is mainly due to thermionic emission.
                 In this work, it is demonstrated that at sub-10nm node,
                 asymmetric underlapped FinFETs with slightly longer
                 underlap toward drain side than source side are
                 superior to symmetric underlapped FinFETs due to
                 further improvement in $ I_{\mathrm{on}} /
                 I_{\mathrm{off}} $ and reduction in gate-to-drain
                 capacitance. Using quantum
                 mechanical device simulations, FinFETs with various
                 degrees of underlap have been analyzed for improvement
                 in I-V characteristics. A FinFET model for circuit
                 simulations has been constructed that captures the
                 major sub-10nm leakage components, namely, thermionic
                 emission, DSDT, EDT, direct gate oxide tunneling and
                 its associated components. By simulating a 10-stage
                 NAND circuit and a LEON3 processor with interconnect
                 parasitics using these devices, it is shown that
                 asymmetric underlap instead of symmetric underlap in
                 sub-10nm FinFETs can offer lower energy consumption
                 with improved performance for near-threshold logic and
                 higher energy-efficiency for super-threshold logic
                 operation.",
  acknowledgement = ack-nhfb,
  articleno =    "23",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Abellan:2017:EPN,
  author =       "Jos{\'e} L. Abell{\'a}n and Chao Chen and Ajay Joshi",
  title =        "Electro-Photonic {NoC} Designs for Kilocore Systems",
  journal =      j-JETC,
  volume =       "13",
  number =       "2",
  pages =        "24:1--24:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2967614",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "The increasing core count in manycore systems requires
                 a corresponding large Network-on-chip (NoC) bandwidth
                 to support the overlying applications. However, it is
                 not possible to provide this large bandwidth in an
                 energy-efficient manner using electrical link
                 technology. To overcome this issue, photonic link
                 technology has been proposed as a replacement. This
                 work explores the limits and opportunities for using
                 photonic links to design the NoC architecture for a
                 future Kilocore system. Three different NoC designs are
                 explored: ElecNoC, an electrical concentrated
                 two-dimensional- (2D) mesh NoC; HybNoC, an electrical
                 concentrated 2D mesh with a photonic multi-crossbar
                 NoC; and PhotoNoC, a photonic multi-bus NoC. We
                 consider both private and shared cache architectures
                 and, to leverage the large bandwidth density of
                 photonic links, we investigate the use of prefetching
                 and aggressive non-blocking caches. Our analysis using
                 contemporary Big Data workloads shows that the
                 non-blocking caches with a shared LLC can best leverage
                 the large bandwidth of the photonic links in the
                 Kilocore system. Moreover, compared to ElecNoC-based
                 and HybNoC-based Kilocore systems, a PhotoNoC-based
                 Kilocore system achieves up to 2.5$ \times $ and 1.5$
                 \times $ better performance, respectively, and can
                 support up to 2.1$ \times $ and 1.1$ \times $ higher
                 bandwidth, respectively, while dissipating comparable
                 power in the overall system.",
  acknowledgement = ack-nhfb,
  articleno =    "24",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Wang:2017:OSS,
  author =       "Yao Wang and Liang Rong and Haibo Wang and Guangjun
                 Wen",
  title =        "One-Step Sneak-Path Free Read Scheme for Resistive
                 Crossbar Memory",
  journal =      j-JETC,
  volume =       "13",
  number =       "2",
  pages =        "25:1--25:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3012002",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "A one-step sneak-path free read scheme for resistive
                 crossbar memory is proposed in this article. During
                 read operation, it configures the crossbar array into a
                 four-terminal resistance network, which is composed of
                 the selected cell and three other resistors
                 corresponding to unselected cells that contribute to
                 the sneak-path. Two sensing voltages with equal
                 potential are applied to three terminals of the
                 network. One is for sensing the resistance of the
                 selected cell; the other is for creating zero-voltage
                 drop across one of the three resistors, which connects
                 the sneak-path to the selected cell. This effectively
                 suppresses the current injected by the sneak-path to
                 the selected cell-sensing loop. This work also proposes
                 a cost-effective data-encoding circuit that guarantees
                 that at least half of the memory cells are in a
                 high-resistance state, which further minimizes
                 sneak-path current. The impact of key design
                 parameters, such as sensing voltage, switch
                 on-resistance, and the ratio of memory cell resistances
                 in different states, as well as nonideal effects are
                 investigated. Equations for estimating the maximum
                 array size to share a single read circuit are derived.
                 The effectiveness of the proposed design has been
                 validated via circuit simulations. Impacts of the
                 word-/bit-line resistance are also analyzed.",
  acknowledgement = ack-nhfb,
  articleno =    "25",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Guler:2017:ULL,
  author =       "Abdullah Guler and Niraj K. Jha",
  title =        "Ultra-low-leakage, Robust {FinFET SRAM} Design Using
                 Multiparameter Asymmetric {FinFETs}",
  journal =      j-JETC,
  volume =       "13",
  number =       "2",
  pages =        "26:1--26:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2988233",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Memory arrays consisting of Static Random Access
                 Memory (SRAM) cells occupy the largest area on chip and
                 are responsible for significant leakage power
                 consumption in modern microprocessors. With the
                 transition from planar Complementary
                 Metal-Oxide-Semiconductor (CMOS) technology to FinFETs,
                 FinFET SRAM design has become important. However,
                 increasing leakage power consumption of FinFETs due to
                 aggressive scaling, width quantization, read-write
                 conflict, and process variations make FinFET SRAM
                 design challenging. In this article, we show how
                 Multiparameter Asymmetric (MPA) FinFETs can be used to
                 design ultra-low-leakage and robust 6T SRAM cells. We
                 combine multiple asymmetries, namely, asymmetry in gate
                 work function, source/drain doping concentration, and
                 gate underlap, to address various SRAM design issues
                 all at once. We propose five novel MPA FinFET SRAM cell
                 designs and compare them with symmetric and
                 Single-Parameter Asymmetric (SPA) FinFET SRAM cells
                 using dc and transient metrics. We show that the
                 leakage current of MPA FinFET SRAM cells can be reduced
                 by up to 58 $ \times $ while ensuring reasonable
                 read/write stability metric values. In addition, high
                 stability metric values can be achieved with 22 $
                 \times $ leakage current reduction compared to the
                 traditional symmetric FinFET SRAM cell. There is no
                 area overhead associated with MPA FinFET SRAM cells.",
  acknowledgement = ack-nhfb,
  articleno =    "26",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Zhang:2017:SSR,
  author =       "Hang Zhang and Xuhao Chen and Nong Xiao and Lei Wang
                 and Fang Liu and Wei Chen and Zhiguang Chen",
  title =        "Shielding {STT--RAM} Based Register Files on {GPUs}
                 against Read Disturbance",
  journal =      j-JETC,
  volume =       "13",
  number =       "2",
  pages =        "27:1--27:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2996191",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "To address the high energy consumption issue of SRAM
                 on GPUs, emerging Spin-Transfer Torque (STT-RAM) memory
                 technology has been intensively studied to build GPU
                 register files for better energy-efficiency, thanks to
                 its benefits of low leakage power, high density, and
                 good scalability. However, STT-RAM suffers from the
                 read disturbance issue, which stems from the fact that
                 the voltage difference between read current and write
                 current becomes smaller as technology scales. The read
                 disturbance leads to high error rates for read
                 operations, which cannot be effectively protected by
                 the SEC-DED ECC on large-capacity register files of
                 GPUs. Prior schemes (e.g., read-restore) to mitigate
                 the read disturbance usually incur either non-trivial
                 performance loss or excessive energy overhead, thus not
                 applicable for the GPU register file design that aims
                 to achieve both high performance and energy-efficiency.
                 To combat the read disturbance, we propose a novel
                 software-hardware co-designed solution (i.e.,
                 Red-Shield), which consists of three optimizations to
                 overcome the limitations of the existing solutions.
                 First, we identify dead reads at compiling stage and
                 augment instructions to avoid unnecessary restores.
                 Second, we employ a small read buffer to accommodate
                 register reads with high-access locality to further
                 reduce restores. Third, we propose an adaptive restore
                 mechanism to selectively pick the suitable restore
                 scheme, according to the busy status of corresponding
                 register banks. Experimental results show that our
                 proposed design can effectively mitigate the
                 performance loss and energy overhead caused by restore
                 operations while still maintaining the reliability of
                 reads.",
  acknowledgement = ack-nhfb,
  articleno =    "27",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Biswas:2017:SAT,
  author =       "Arnab Kumar Biswas",
  title =        "Source Authentication Techniques for Network-on-Chip
                 Router Configuration Packets",
  journal =      j-JETC,
  volume =       "13",
  number =       "2",
  pages =        "28:1--28:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2996194",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "It is known that maliciously configured
                 Network-on-Chip routers can enable an attacker to
                 launch different attacks inside a Multiprocessor
                 System-on-Chip. A source authentication mechanism for
                 router configuration packets can prevent such
                 vulnerability. This ensures that a router is configured
                 by the configuration packets sent only by a trusted
                 configuration source. Conventional method like Secure
                 Hash Algorithm-3 (SHA-3) can provide required source
                 authentication in a router but with a router area
                 overhead of 1355.25\% compared to a normal router area.
                 We propose eight source authentication mechanisms that
                 can achieve similar level of security as SHA-3 for a
                 router configuration perspective without causing
                 significant area and power increase. Moreover, the
                 processing time of our proposed techniques is 1/100th
                 of SHA-3 implementation. Most of our proposed
                 techniques use different timing channel watermarking
                 methods to transfer source authentication data to the
                 receiver router. We also propose the Individual
                 packet-based stream authentication technique and
                 combinations of this technique with timing channel
                 watermarking techniques. It is shown that, among all of
                 our proposed techniques, maximum router area increment
                 required is 28.32\% compared to a normal router.",
  acknowledgement = ack-nhfb,
  articleno =    "28",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Mittal:2017:STA,
  author =       "Sparsh Mittal",
  title =        "A Survey of Techniques for Architecting Processor
                 Components Using Domain-Wall Memory",
  journal =      j-JETC,
  volume =       "13",
  number =       "2",
  pages =        "29:1--29:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2994550",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Recent trends of increasing core-count and
                 bandwidth/memory wall have motivated researchers to
                 explore novel memory technologies for designing
                 processor components such as cache, register file,
                 shared memory, and so on. Domain-wall memory (DWM),
                 also known as racetrack memory, is a promising emerging
                 technology due to its non-volatility and very high
                 density. However, use of DWM presents challenges due to
                 characteristics of both DWM itself (e.g., requirement
                 of shift operations, variable latency) and processor
                 components. Recently, several techniques have been
                 proposed to address these challenges. This article
                 presents a survey of architectural techniques for using
                 DWM for designing components in both CPU and GPU. We
                 discuss techniques related to performance, energy, and
                 reliability and also discuss works that compare DWM
                 with other memory technologies. We also highlight the
                 opportunities and obstacles in using DWM for designing
                 processor components. This survey is expected to spark
                 further research in this area and be useful for
                 researchers, chip designers, and computer architects.",
  acknowledgement = ack-nhfb,
  articleno =    "29",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Cao:2017:GEI,
  author =       {Yu Cao and Xin Li and Taemin Kim and Suyog Gupta},
  title =        {Guest Editors' Introduction: Hardware and Algorithms
                 for On-Chip Learning},
  journal =      j-JETC,
  volume =       {13},
  number =       {3},
  pages =        {30:1--30:??},
  month =        may,
  year =         {2017},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3022193},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Tue Jul 11 17:10:31 MDT 2017},
  bibsource =    {http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  articleno =    {30},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Page:2017:SHA,
  author =       "Adam Page and Ali Jafari and Colin Shea and Tinoosh
                 Mohsenin",
  title =        "{SPARCNet}: a Hardware Accelerator for Efficient
                 Deployment of Sparse Convolutional Networks",
  journal =      j-JETC,
  volume =       "13",
  number =       "3",
  pages =        "31:1--31:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3005448",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jul 11 17:10:31 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Deep neural networks have been shown to outperform
                 prior state-of-the-art solutions that often relied
                 heavily on hand-engineered feature extraction
                 techniques coupled with simple classification
                 algorithms. In particular, deep convolutional neural
                 networks have been shown to dominate on several popular
                 public benchmarks such as the ImageNet database.
                 Unfortunately, the benefits of deep networks have yet
                 to be fully exploited in embedded, resource-bound
                 settings that have strict power and area budgets.
                 Graphical processing unit (GPU) have been shown to
                 improve throughput and energy-efficiency over central
                 processing unit (CPU) due to their highly parallel
                 architecture yet still impose a significant power
                 burden. In a similar fashion, field programmable gate
                 array (FPGA) can be used to improve performance while
                 further allowing more fine-grained control over
                 implementation to improve efficiency. In order to
                 reduce power and area while still achieving required
                 throughput, classification-efficient network
                 architectures are required in addition to optimal
                 deployment on efficient hardware. In this work, we
                 target both of these enterprises. For the first
                 objective, we analyze simple, biologically inspired
                 reduction strategies that are applied both before and
                 after training. The central theme of the techniques is
                 the introduction of sparsification to help dissolve
                 away the dense connectivity that is often found at
                 different levels in convolutional neural networks. The
                 sparsification techniques include feature compression
                 partition, structured filter pruning, and dynamic
                 feature pruning. Additionally, we explore filter
                 factorization and filter quantization approximation
                 techniques to further reduce the complexity of
                 convolutional layers. In the second contribution, we
                 propose SPARCNet, a hardware accelerator for efficient
                 deployment of SPARse Convolutional NETworks. The
                 accelerator looks to enable deploying networks in such
                 resource-bound settings by both exploiting efficient
                 forms of parallelism inherent in convolutional layers
                 and by exploiting the sparsification and approximation
                 techniques proposed. To demonstrate both contributions,
                 modern deep convolutional network architectures
                 containing millions of parameters are explored within
                 the context of the computer vision dataset CIFAR.
                 Utilizing the reduction techniques, we demonstrate the
                 ability to reduce computation and memory by 60\% and
                 93\% with less than 0.03\% impact on accuracy when
                 compared to the best baseline network with 93.47\%
                 accuracy. The SPARCNet accelerator with different
                 numbers of processing engines is implemented on a
                 low-power Artix-7 FPGA platform. Additionally, the same
                 networks are optimally implemented on a number of
                 embedded commercial-off-the-shelf platforms including
                 NVIDIAs CPU+GPU SoCs TK1 and TX1 and Intel Edison.
                 Compared to NVIDIAs TK1 and TX1, the FPGA-based
                 accelerator obtains 11.8 $ \times $ and 7.5 $ \times $
                 improvement in energy efficiency while maintaining a
                 classification throughput of 72 images/s. When further
                 compared to a number of recent FPGA-based accelerators,
                 SPARCNet is able to achieve up to 15 $ \times $
                 improvement in energy efficiency while consuming less
                 than 2W of total board power at 100MHz. In addition to
                 improving efficiency, the accelerator has built-in
                 support for sparsification techniques and ability to
                 perform in-place rectified linear unit (ReLU)
                 activation function, max-pooling, and batch
                 normalization.",
  acknowledgement = ack-nhfb,
  articleno =    "31",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@article{Anwar:2017:SPD,
  author = {Sajid Anwar and Kyuyeon Hwang and Wonyong Sung},
  title = {Structured Pruning of Deep Convolutional Neural
    Networks},
  journal = j-JETC,
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  volume = 13,
  number = 3,
  articleno = 32,
  pages = {32:1--32:??},
  month = may,
  year = 2017,
  doi = {https://doi.org/10.1145/3005348},
  issn = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l = {1550-4832},
  coden = {????},
  bibdate = {Tue Jul 11 17:10:31 MDT 2017},
  bibsource = {http://www.acm.org/pubs/contents/journals/jetc/;
    https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  abstract = {Real-time application of deep learning algorithms is
    often hindered by high computational complexity and
    frequent memory accesses. Network pruning is a
    promising technique to solve this problem. However,
    pruning usually results in irregular network
    connections that not only demand extra representation
    efforts but also do not fit well on parallel
    computation. We introduce structured sparsity at
    various scales for convolutional neural networks:
    feature map-wise, kernel-wise, and intra-kernel strided
    sparsity. This structured sparsity is very advantageous
    for direct computational resource savings on embedded
    computers, in parallel computing environments, and in
    hardware-based systems. To decide the importance of
    network connections and paths, the proposed method uses
    a particle filtering approach. The importance weight of
    each particle is assigned by assessing the
    misclassification rate with a corresponding
    connectivity pattern. The pruned network is retrained
    to compensate for the losses due to pruning. While
    implementing convolutions as matrix products, we
    particularly show that intra-kernel strided sparsity
    with a simple constraint can significantly reduce the
    size of the kernel and feature map tensors. The
    proposed work shows that when pruning granularities are
    applied in combination, we can prune the CIFAR-10
    network by more than 70\% with less than a 1\% loss in
    accuracy.},
}

@article{Panda:2017:EEI,
  author = {Priyadarshini Panda and Abhronil Sengupta and Kaushik
    Roy},
  title = {Energy-Efficient and Improved Image Recognition with
    Conditional Deep Learning},
  journal = j-JETC,
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  volume = 13,
  number = 3,
  articleno = 33,
  pages = {33:1--33:??},
  month = may,
  year = 2017,
  doi = {https://doi.org/10.1145/3007192},
  issn = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l = {1550-4832},
  coden = {????},
  bibdate = {Tue Jul 11 17:10:31 MDT 2017},
  bibsource = {http://www.acm.org/pubs/contents/journals/jetc/;
    https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  abstract = {Deep-learning neural networks have proven to be very
    successful for a wide range of recognition tasks across
    modern computing platforms. However, the computational
    requirements associated with such deep nets can be
    quite high, and hence their energy-efficient
    implementation is of great interest. Although,
    traditionally, the entire network is utilized for the
    recognition of all inputs, we observe that the
    classification difficulty varies widely across inputs
    in real-world datasets; only a small fraction of inputs
    requires the full computational effort of a network,
    while a large majority can be classified correctly with
    very low effort. In this article, we propose
    Conditional Deep Learning (CDL), where the
    convolutional layer features are used to identify the
    variability in the difficulty of input instances and
    conditionally activate the deeper layers of the
    network. We achieve this by cascading a linear network
    of output neurons for each convolutional layer and
    monitoring the output of the linear network to decide
    whether classification can be terminated at the current
    stage or not. The proposed methodology thus enables the
    network to dynamically adjust the computational effort
    depending on the difficulty of the input data while
    maintaining competitive classification accuracy. The
    overall energy benefits for MNIST/CIFAR10/Tiny ImageNet
    datasets with state-of-the-art deep-learning
    architectures are $ 1.84 \times $ / $ 2.83 \times $ / $
    4.02 \times $, respectively. We further employ the
    conditional approach to train deep-learning networks
    from scratch with integrated supervision from the
    additional output neurons appended at the intermediate
    convolutional layers. Our proposed integrated CDL
    training leads to an improvement in the gradient
    convergence behavior giving substantial error rate
    reduction on MNIST/CIFAR-10, resulting in improved
    classification over state-of-the-art baseline
    networks.},
}

@article{Karam:2017:MCR,
  author = {Robert Karam and Somnath Paul and Ruchir Puri and
    Swarup Bhunia},
  title = {Memory-Centric Reconfigurable Accelerator for
    Classification and Machine Learning Applications},
  journal = j-JETC,
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  volume = 13,
  number = 3,
  articleno = 34,
  pages = {34:1--34:??},
  month = may,
  year = 2017,
  doi = {https://doi.org/10.1145/2997649},
  issn = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l = {1550-4832},
  coden = {????},
  bibdate = {Tue Jul 11 17:10:31 MDT 2017},
  bibsource = {http://www.acm.org/pubs/contents/journals/jetc/;
    https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  abstract = {Big Data refers to the growing challenge of turning
    massive, often unstructured datasets into meaningful,
    organized, and actionable data. As datasets grow from
    petabytes to exabytes and beyond, it becomes
    increasingly difficult to run advanced analytics,
    especially Machine Learning (ML) applications, in a
    reasonable time and on a practical power budget using
    traditional architectures. Previous work has focused on
    accelerating analytics readily implemented as SQL
    queries on data-parallel platforms, generally using
    off-the-shelf CPUs and General Purpose Graphics
    Processing Units (GPGPUs) for computation or
    acceleration. However, these systems are
    general-purpose and still require a vast amount of data
    transfer between the storage devices and computing
    elements, thus limiting the system efficiency. As an
    alternative, this article presents a reconfigurable
    memory-centric advanced analytics accelerator that
    operates at the last level of memory and dramatically
    reduces energy required for data transfer. We
    functionally validate the framework using an FPGA-based
    hardware emulation platform and three representative
    applications: Na{\"\i}ve Bayesian Classification,
    Convolutional Neural Networks, and k-Means Clustering.
    Results are compared with implementations on a modern
    CPU and workstation GPGPU. Finally, the use of
    in-memory dataset decompression to further reduce data
    transfer volume is investigated. With these techniques,
    the system achieves an average energy efficiency
    improvement of 74$ \times $ and 212$ \times $ over GPU
    and single-threaded CPU, respectively, while dataset
    compression is shown to improve overall efficiency by
    an additional 1.8$ \times $ on average.},
}

@article{Yuan:2017:VAR,
  author = {Bo Yuan and Keshab K. Parhi},
  title = {{VLSI} Architectures for the {Restricted Boltzmann
    Machine}},
  journal = j-JETC,
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  volume = 13,
  number = 3,
  articleno = 35,
  pages = {35:1--35:??},
  month = may,
  year = 2017,
  doi = {https://doi.org/10.1145/3007193},
  issn = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l = {1550-4832},
  coden = {????},
  bibdate = {Tue Jul 11 17:10:31 MDT 2017},
  bibsource = {http://www.acm.org/pubs/contents/journals/jetc/;
    https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  abstract = {Neural network (NN) systems are widely used in many
    important applications ranging from computer vision to
    speech recognition. To date, most NN systems are
    processed by general processing units like CPUs or
    GPUs. However, as the sizes of dataset and network
    rapidly increase, the original software implementations
    suffer from long training time. To overcome this
    problem, specialized hardware accelerators are needed
    to design high-speed NN systems. This article presents
    an efficient hardware architecture of restricted
    Boltzmann machine (RBM) that is an important category
    of NN systems. Various optimization approaches at the
    hardware level are performed to improve the training
    speed. As-soon-as-possible and overlapped-scheduling
    approaches are used to reduce the latency. It is shown
    that, compared with the flat design, the proposed RBM
    architecture can achieve 50\% reduction in training
    time. In addition, an on-the-fly computation scheme is
    also used to reduce the storage requirement of binary
    and stochastic states by several hundreds of times.
    Then, based on the proposed approach, a 784-2252 RBM
    design example is developed for MNIST handwritten digit
    recognition dataset. Analysis shows that the VLSI
    design of RBM achieves significant improvement in
    training speed and energy efficiency as compared to
    CPU/GPU-based solution.},
}

@article{Ni:2017:DMC,
  author = {Leibin Ni and Hantao Huang and Zichuan Liu and Rajiv
    V. Joshi and Hao Yu},
  title = {Distributed In-Memory Computing on Binary {RRAM}
    Crossbar},
  journal = j-JETC,
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  volume = 13,
  number = 3,
  articleno = 36,
  pages = {36:1--36:??},
  month = may,
  year = 2017,
  doi = {https://doi.org/10.1145/2996192},
  issn = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l = {1550-4832},
  coden = {????},
  bibdate = {Tue Jul 11 17:10:31 MDT 2017},
  bibsource = {http://www.acm.org/pubs/contents/journals/jetc/;
    https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  abstract = {The recently emerging resistive random-access memory
    (RRAM) can provide nonvolatile memory storage but also
    intrinsic computing for matrix-vector multiplication,
    which is ideal for the low-power and high-throughput
    data analytics accelerator performed in memory.
    However, the existing RRAM crossbar--based computing is
    mainly assumed as a multilevel analog computing, whose
    result is sensitive to process nonuniformity as well as
    additional overhead from AD-conversion and I/O. In this
    article, we explore the matrix-vector multiplication
    accelerator on a binary RRAM crossbar with adaptive
    1-bit-comparator--based parallel conversion. Moreover,
    a distributed in-memory computing architecture is also
    developed with the according control protocol. Both
    memory array and logic accelerator are implemented on
    the binary RRAM crossbar, where the logic-memory pair
    can be distributed with the control bus protocol.
    Experimental results have shown that compared to the
    analog RRAM crossbar, the proposed binary RRAM crossbar
    can achieve significant area savings with better
    calculation accuracy. Moreover, significant speedup can
    be achieved for matrix-vector multiplication in neural
    network--based machine learning such that the overall
    training and testing time can be both reduced. In
    addition, large energy savings can be also achieved
    when compared to the traditional CMOS-based
    out-of-memory computing architecture.},
}

@Article{Merkel:2017:SCB,
  author =       "Cory Merkel and Dhireesha Kudithipudi and Manan Suri
                 and Bryant Wysocki",
  title =        "Stochastic {CBRAM}-Based Neuromorphic Time Series
                 Prediction System",
  journal =      j-JETC,
  volume =       "13",
  number =       "3",
  pages =        "37:1--37:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2996193",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jul 11 17:10:31 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "In this research, we present a Conductive-Bridge RAM
                 (CBRAM)-based neuromorphic system which efficiently
                 addresses time series prediction. We propose a new (i)
                 voltage-mode, stochastic, multiweight synapse circuit
                 based on experimental bi-stable CBRAM devices, (ii) a
                 voltage-mode neuron circuit based on the concept of
                 charge sharing, and (iii) an optimized training
                 methodology powered by a stochastic implementation of
                 the Least-Mean-Squares (SLMS) training rule. To
                 validate the proposed design, we use time series
                 prediction for short-term electrical load forecasting
                 in smart grids. Our system is able to forecast hourly
                 electrical loads with a mean accuracy of 96\%, an
                 estimated power dissipation of 15 $ \mu $W, and area
                 of 14.5 $ \mu \mathrm{m}^2 $ at 65 nm CMOS technology.",
  acknowledgement = ack-nhfb,
  articleno =    "37",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@article{Topaloglu:2017:EJS,
  author = {Rasit O. Topaloglu and Naveen Verma},
  title = {Editorial for {JETC} Special Issue on Alternative
    Computing Systems},
  journal = j-JETC,
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  volume = 13,
  number = 3,
  articleno = 38,
  pages = {38:1--38:??},
  month = may,
  year = 2017,
  doi = {https://doi.org/10.1145/3022700},
  issn = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l = {1550-4832},
  coden = {????},
  bibdate = {Tue Jul 11 17:10:31 MDT 2017},
  bibsource = {http://www.acm.org/pubs/contents/journals/jetc/;
    https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
}

@article{Britt:2017:HPC,
  author = {Keith A. Britt and Travis S. Humble},
  title = {High-Performance Computing with Quantum Processing
    Units},
  journal = j-JETC,
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  volume = 13,
  number = 3,
  articleno = 39,
  pages = {39:1--39:??},
  month = may,
  year = 2017,
  doi = {https://doi.org/10.1145/3007651},
  issn = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l = {1550-4832},
  coden = {????},
  bibdate = {Tue Jul 11 17:10:31 MDT 2017},
  bibsource = {http://www.acm.org/pubs/contents/journals/jetc/;
    https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  abstract = {The prospects of quantum computing have driven efforts
    to realize fully functional quantum processing units
    (QPUs). Recent success in developing proof-of-principle
    QPUs has prompted the question of how to integrate
    these emerging processors into modern high-performance
    computing (HPC) systems. We examine how QPUs can be
    integrated into current and future HPC system
    architectures by accounting for functional and physical
    design requirements. We identify two integration
    pathways that are differentiated by infrastructure
    constraints on the QPU and the use cases expected for
    the HPC system. This includes a tight integration that
    assumes infrastructure bottlenecks can be overcome as
    well as a loose integration that assumes they cannot.
    We find that the performance of both approaches is
    likely to depend on the quantum interconnect that
    serves to entangle multiple QPUs. We also identify
    several challenges in assessing QPU performance for
    HPC, and we consider new metrics that capture the
    interplay between system architecture and the quantum
    parallelism underlying computational performance.},
}

@Article{Yoon:2017:MUM,
  author =       "Su-Kyung Yoon and Young-Sun Youn and Kihyun Park and
                 Shin-Dug Kim",
  title =        "Mobile Unified Memory-Storage Structure Based on
                 Hybrid Non-Volatile Memories",
  journal =      j-JETC,
  volume =       "13",
  number =       "3",
  pages =        "40:1--40:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3007650",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jul 11 17:10:31 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "In mobile computing systems, the limited amount of
                 main memory space leads to page swap operation overhead
                 and data duplication in both main memory and secondary
                 storage. Furthermore, SQLite write operations in mobile
                 devices such as smartphones and tablet PCs tend to
                 frequently overwrite data to storage, significantly
                 degrading performance. Thus, this article presents a
                 unified memory-storage structure that is optimized for
                 mobile devices and blurs the boundary between the
                 existing main memory layer and secondary storage layer.
                 This structure can eliminate the conventional page-swap
                 operations that cause significant performance
                 degradation and support fast program execution time.
                 The unified memory-storage structure consists of a
                 dynamic RAM (DRAM) and phase change memory (PCM)-based
                 dual buffering module, a hybrid unified memory-storage
                 array consisting of DRAM and NAND Flash memory, and an
                 associated unified storage translation layer devised
                 for the memory address and file translation mechanism
                 as a system software module. This hybrid array of
                 non-volatile memories is formed as a single memory-disk
                 integrated storage space that can be logically divided
                 into static and dynamic spaces. Experimental results
                 show that the overall performance of the hybrid unified
                 memory-storage system with the buffering structure
                 increases by around 13\% and power consumption is also
                 improved by 35\%, compared to current mobile system.",
  acknowledgement = ack-nhfb,
  articleno =    "40",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@article{Guha:2017:RTS,
  author = {Krishnendu Guha and Debasri Saha and Amlan
    Chakrabarti},
  title = {Real-Time {SoC} Security against Passive Threats Using
    Crypsis Behavior of Geckos},
  journal = j-JETC,
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  volume = 13,
  number = 3,
  articleno = 41,
  pages = {41:1--41:??},
  month = may,
  year = 2017,
  doi = {https://doi.org/10.1145/3014166},
  issn = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l = {1550-4832},
  coden = {????},
  bibdate = {Tue Jul 11 17:10:31 MDT 2017},
  bibsource = {http://www.acm.org/pubs/contents/journals/jetc/;
    https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
    https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  abstract = {The rapid evolution of the embedded era has witnessed
    globalization for the design of SoC architectures in
    the semiconductor design industry. Though issues of
    cost and stringent marketing deadlines have been
    resolved in such a methodology, yet the root of
    hardware trust has been evicted. Malicious circuitry,
    a.k.a. Hardware Trojan Horse (HTH), is inserted by
    adversaries in the less trusted phases of design. A HTH
    remains dormant during testing but gets triggered at
    runtime to cause sudden active and passive attacks. In
    this work, we focus on the runtime passive threats
    based on the parameter delay. Nature-inspired
    algorithms offer an alternative to the conventional
    techniques for solving complex problems in the domain
    of computer science. However, most are optimization
    techniques and none is dedicated to security. We seek
    refuge to the crypsis behavior exhibited by geckos in
    nature to generate a runtime security technique for SoC
    architectures, which can bypass runtime passive threats
    of a HTH. An adaptive security intellectual property
    (IP) that works on the proposed security principles is
    designed. Embedded timing analysis is used for
    experimental validation. Low area and power overhead of
    our proposed security IP over standard benchmarks and
    practical crypto SoC architectures as obtained in
    experimental results supports its applicability for
    practical implementations.},
}

@article{Liu:2017:CPU,
  author = {Yin Liu and Keshab K. Parhi},
  title = {Computing Polynomials Using Unipolar Stochastic
    Logic},
  journal = j-JETC,
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  volume = 13,
  number = 3,
  articleno = 42,
  pages = {42:1--42:??},
  month = may,
  year = 2017,
  doi = {https://doi.org/10.1145/3007648},
  issn = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l = {1550-4832},
  coden = {????},
  bibdate = {Tue Jul 11 17:10:31 MDT 2017},
  bibsource = {http://www.acm.org/pubs/contents/journals/jetc/;
    https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  abstract = {This article addresses subtraction and polynomial
    computations using unipolar stochastic logic.
    Stochastic computing requires simple logic gates, and
    stochastic logic--based circuits are inherently fault
    tolerant. Thus, these structures are well suited for
    nanoscale CMOS technologies. It is well known that an
    AND gate and a multiplexer can be used to implement
    stochastic unipolar multiplier and adder, respectively.
    Although it is easy to realize multiplication and
    scaled addition, implementation of subtraction is
    nontrivial using unipolar stochastic logic.
    Additionally, an accurate computation of subtraction is
    critical for the implementation of polynomials with
    negative coefficients in stochastic unipolar
    representation. This work, for the first time,
    demonstrates that instead of using well-known Bernstein
    polynomials, stochastic computation of polynomials can
    be implemented by using a stochastic subtractor and
    factorization. Three major contributions are given in
    this article. First, two approaches are proposed to
    compute subtraction in stochastic unipolar
    representation. In the first approach, the subtraction
    operation is approximated by cascading multilevels of
    OR and AND gates. The accuracy of the approximation is
    improved with the increase in the number of stages. In
    the second approach, the stochastic subtraction is
    implemented using a multiplexer and a stochastic
    divider. This approach requires more hardware
    complexity due to the use of a linear-feedback shift
    register and a counter for division. Second,
    computation of polynomials in stochastic unipolar
    format is presented using scaled addition and proposed
    stochastic subtraction. Third, we propose stochastic
    computation of polynomials using factorization.
    Stochastic implementations of first- and second-order
    factors are presented for different locations of
    polynomial roots. From experimental results, it is
    shown that the proposed stochastic logic circuits
    require less hardware complexity than the previous
    stochastic polynomial implementation using Bernstein
    polynomials.},
}

@article{Golnari:2017:PCE,
  author = {Pareesa Ameneh Golnari and Yavuz Yetim and Margaret
    Martonosi and Yakir Vizel and Sharad Malik},
  title = {{PPU}: a Control Error-Tolerant Processor for
    Streaming Applications with Formal Guarantees},
  journal = j-JETC,
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  volume = 13,
  number = 3,
  articleno = 43,
  pages = {43:1--43:??},
  month = may,
  year = 2017,
  doi = {https://doi.org/10.1145/2990502},
  issn = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l = {1550-4832},
  coden = {????},
  bibdate = {Tue Jul 11 17:10:31 MDT 2017},
  bibsource = {http://www.acm.org/pubs/contents/journals/jetc/;
    https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  abstract = {With increasing technology scaling and design
    complexity there are increasing threats from device and
    circuit failures. This is expected to worsen with
    post-CMOS devices. Current error-resilient solutions
    ensure reliability of circuits through protection
    mechanisms such as redundancy, error correction, and
    recovery. However, the costs of these solutions may be
    high, rendering them impractical. In contrast,
    error-tolerant solutions allow errors in the
    computation and are positioned to be suitable for
    error-tolerant applications such as media applications.
    For such programmable error-tolerant processors, the
    Instruction-Set-Architecture (ISA) no longer serves as
    a specification since it is acceptable for the
    processor to allow for errors during the execution of
    instructions. In this work, we address this
    specification gap by defining the basic requirements
    needed for an error-tolerant processor to provide
    acceptable results. Furthermore, we formally define
    properties that capture these requirements. Based on
    this, we propose the Partially Protected Uniprocessor
    (PPU), an error-tolerant processor that aims to meet
    these requirements with low-cost microarchitectural
    support. These protection mechanisms convert
    potentially fatal control errors to potentially
    tolerable data errors instead of ensuring
    instruction-level or byte-level correctness. The
    protection mechanisms in PPU protect the system against
    crashes, unresponsiveness, and external device
    corruption. In addition, they also provide support for
    achieving acceptable result quality. Additionally, we
    provide a methodology that formally proves the
    specification properties on PPU using model checking.
    This methodology uses models for the hardware and
    software that are integrated with the fault and
    recovery models. Finally, we experimentally demonstrate
    the results of model checking and the application-level
    quality of results for PPU.},
}

@Article{Gorantla:2017:DAC,
  author =       "Anusha Gorantla and Deepa P.",
  title =        "Design of Approximate Compressors for Multiplication",
  journal =      j-JETC,
  volume =       "13",
  number =       "3",
  pages =        "44:1--44:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3007649",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jul 11 17:10:31 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Approximate computing is a promising technique for
                 energy-efficient Very Large Scale Integration (VLSI)
                 system design. It is best suited for error-resilient
                 applications such as signal processing and multimedia.
                 Approximate computing reduces accuracy but still
                 provides significant and faster results with lower
                 power consumption. This is attractive to arithmetic
                 circuits. In this article, various novel design
                 approaches of approximate 4-2 and 5-2 compressors have
                 been proposed for reduction of the partial product
                 stages in multiplication. Three approximate 8 $ \times
                 $ 8 Dadda multiplier designs using three novel
                 approximate 4-2 compressors and two approximate 8 $
                 \times $ 8 Dadda multiplier designs using two novel
                 approximate 5-2 compressors have been proposed. The
                 synthesis results show that the proposed designs
                 achieved significant accuracy improvement together with
                 power and delay reductions compared to the existing
                 approximate designs.",
  acknowledgement = ack-nhfb,
  articleno =    "44",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Kumar:2017:THS,
  author          = {Arvind Kumar and Zhe Wan and Winfried W. Wilcke and
                     Subramanian S. Iyer},
  title           = {Toward Human-Scale Brain Computing Using {$3$D} Wafer
                     Scale Integration},
  journal         = j-JETC,
  volume          = {13},
  number          = {3},
  pages           = {45:1--45:??},
  month           = may,
  year            = {2017},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2976742},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Tue Jul 11 17:10:31 MDT 2017},
  bibsource       = {http://www.acm.org/pubs/contents/journals/jetc/;
                     https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {The Von Neumann architecture, defined by strict and
                     hierarchical separation of memory and processor, has
                     been a hallmark of conventional computer design since
                     the 1940s. It is becoming increasingly unsuitable for
                     cognitive applications, which require massive parallel
                     processing of highly interdependent data. Inspired by
                     the brain, we propose a significantly different
                     architecture characterized by a large number of highly
                     interconnected simple processors intertwined with very
                     large amounts of low-latency memory. We contend that
                     this memory-centric architecture can be realized using
                     3D wafer scale integration for which the technology is
                     nearing readiness, combined with current CMOS device
                     technologies. The natural fault tolerance and lower
                     power requirements of neuromorphic processing make 3D
                     wafer stacking particularly attractive. In order to
                     assess the performance of this architecture, we propose
                     a specific embodiment of a neuronal system using 3D
                     wafer scale integration; formulate a simple model of
                     brain connectivity including short- and long-range
                     connections; and estimate the memory, bandwidth,
                     latency, and power requirements of the system using the
                     connectivity model. We find that 3D wafer scale
                     integration, combined with technologies nearing
                     readiness, offers the potential for scaleup to a
                     primate-scale brain, while further scaleup to a
                     human-scale brain would require significant additional
                     innovations.},
  acknowledgement = ack-nhfb,
  articleno       = {45},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Alawad:2017:SCS,
  author =       "Mohammed Alawad and Mingjie Lin",
  title =        "Sketching Computation with Stochastic Processing
                 Engines",
  journal =      j-JETC,
  volume =       "13",
  number =       "3",
  pages =        "46:1--46:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3007652",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jul 11 17:10:31 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "This article explores how to leverage stochastic
                 principles to gracefully exploit partial computation
                 results, hence achieving quality-scalable embedded
                 computing. Our work is inspired by the concept of
                 incremental sketching frequently found in artistic
                 rendering, where the drawing procedure consists of a
                 series of steps, each gradually improving the quality
                 of results. The essence of our approach is to first
                 encode input signals as probability density functions
                 (PDFs), then perform stochastic computing operations on
                 all signals in the probabilistic domain, and finally
                 decode output signals by estimating the PDF of these
                 resulting random samples. Although numerous approximate
                 computing schemes exist, such as inaccurate adders and
                 multipliers that reduce bit width or weaken logic
                 circuit design, none of them can seamlessly improve
                 computing accuracy incrementally without making any
                 changes to the computing hardware at runtime.
                 Furthermore, in conventional embedded computing, a
                 sudden shortage of computing resources, such as
                 premature termination, often means a complete computing
                 failure and totally unusable results. Our sketching
                 computing scheme can readily trade off between the
                 quality of results and computing efforts without
                 modifying its circuit design. To validate our proposed
                 architecture design, we have implemented a
                 proof-of-concept computation sketching engine based on
                 a probabilistic convolver using a Virtex-6 FPGA device.
                 Using three widely deployed image processing
                 applications---image correspondence, image sharpening,
                 and edge detection---we have demonstrated that
                 important embedded computing applications can indeed be
                 ``sketched'' in a graceful manner using roughly one
                 third the hardware and one fifth the energy compared to
                 the traditional multiplier-based computing method.",
  acknowledgement = ack-nhfb,
  articleno =    "46",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Alaghi:2017:TAE,
  author          = {Armin Alaghi and Wei-Ting J. Chan and John P. Hayes
                     and Andrew B. Kahng and Jiajia Li},
  title           = {Trading Accuracy for Energy in Stochastic Circuit
                     Design},
  journal         = j-JETC,
  volume          = {13},
  number          = {3},
  pages           = {47:1--47:??},
  month           = may,
  year            = {2017},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2990503},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Tue Jul 11 17:10:31 MDT 2017},
  bibsource       = {http://www.acm.org/pubs/contents/journals/jetc/;
                     https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {As we approach the limits of traditional Moore's-Law
                     scaling, alternative computing techniques that consume
                     energy more efficiently become attractive. Stochastic
                     computing (SC), as a re-emerging computing technique,
                     is a low-cost and error-tolerant alternative to
                     conventional binary circuits in several important
                     applications such as image processing and
                     communications. SC allows a natural accuracy-energy
                     tradeoff that has been exploited in the past. This
                     article presents an accuracy-energy tradeoff technique
                     for SC circuits that reduces their energy consumption
                     with virtually no accuracy loss. To this end, we employ
                     voltage or frequency scaling, which normally reduce
                     energy consumption at the cost of timing errors. Then
                     we show that due to their inherent error tolerance, SC
                     circuits operate satisfactorily without significant
                     accuracy loss even with aggressive scaling. This
                     significantly improves their energy efficiency. In
                     contrast, conventional binary circuits quickly fail as
                     the supply voltage decreases. To find the most
                     energy-efficient operating point of an SC circuit, we
                     propose an error estimation method that allows us to
                     quickly explore the circuit's design space. The error
                     estimation method is based on Markov chain and
                     least-squares regression. Furthermore, we investigate
                     opportunities to optimize SC circuits under such
                     aggressive scaling. We find that logical and physical
                     design techniques can be combined to significantly
                     expand the already-powerful accuracy-energy tradeoff
                     possibilities of SC. In particular, we demonstrate that
                     careful adjustment of path delays can lead to
                     significant error reduction under voltage and frequency
                     scaling. We perform buffer insertion and route
                     detouring to achieve more balanced path delays. These
                     techniques differ from conventional path-balancing
                     techniques whose goal is to minimize power consumption
                     by resizing the non-critical paths. The goal of our
                     path-balancing approach is to increase error
                     cancellation chances in voltage-/frequency-scaled SC
                     circuits. Our circuit optimization comprehends the
                     tradeoff between power overheads due to inserted
                     buffers and wires versus the energy reduction from
                     supply voltage downscaling enabled by more balanced
                     path delays. Simulation results show that our optimized
                     SC circuits can tolerate aggressive voltage scaling
                     with no significant signal-to-noise ratio (SNR)
                     degradation. In one example, a 40\% supply voltage
                     reduction (1V to 0.6V) on the SC circuit leads to 66\%
                     energy saving (20.7pJ to 6.9pJ) and makes it more
                     efficient than its conventional binary counterpart. In
                     the same example, a 100\% frequency boosting (400ps to
                     200ps) of the optimized circuits leads to no
                     significant SNR degradation. We also show that process
                     variation and temperature variation have limited impact
                     on optimized SC circuits. The error change is less than
                     5\% when temperature changes by 100${}^\circ $C or
                     process condition changes from worst case to best
                     case.},
  acknowledgement = ack-nhfb,
  articleno       = {47},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}

@Article{Salehi:2017:SSM,
  author =       "Soheil Salehi and Deliang Fan and Ronald F. Demara",
  title =        "Survey of {STT--MRAM} Cell Design Strategies: Taxonomy
                 and Sense Amplifier Tradeoffs for Resiliency",
  journal =      j-JETC,
  volume =       "13",
  number =       "3",
  pages =        "48:1--48:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2997650",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jul 11 17:10:31 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Spin-Transfer Torque Random Access Memory (STT-MRAM)
                 has been explored as a post-CMOS technology for
                 embedded and data storage applications seeking
                 non-volatility, near-zero standby energy, and high
                 density. Towards attaining these objectives for
                 practical implementations, various techniques to
                 mitigate the specific reliability challenges associated
                 with STT-MRAM elements are surveyed, classified, and
                 assessed in this article. Cost and suitability metrics
                 assessed include the area of nanomagnetic and CMOS
                 components per bit, access time and complexity, sense
                 margin, and energy or power consumption costs versus
                 resiliency benefits. Solutions to the reliability
                 issues identified are addressed within a taxonomy
                 created to categorize the current and future approaches
                 to reliable STT-MRAM designs. A variety of destructive
                 and non-destructive sensing schemes are assessed for
                 process variation tolerance, read disturbance
                 reduction, sense margin, and write polarization
                 asymmetry compensation. The highest resiliency
                 strategies deliver a sensing margin above 300mV while
                 incurring low power and energy consumption on the order
                 of picojoules and microwatts, respectively, and
                 attaining read sense latency of a few nanoseconds down
                 to hundreds of picoseconds for non-destructive and
                 destructive sensing schemes, respectively.",
  acknowledgement = ack-nhfb,
  articleno =    "48",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Yu:2017:RMA,
  author =       "Songping Yu and Nong Xiao and Mingzhu Deng and Fang
                 Liu and Wei Chen",
  title =        "Redesign the Memory Allocator for Non-Volatile Main
                 Memory",
  journal =      j-JETC,
  volume =       "13",
  number =       "3",
  pages =        "49:1--49:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2997651",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jul 11 17:10:31 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "The non-volatile memory (NVM) has the merits of
                 byte-addressability, fast speed, persistency and low
                 power consumption, which make it attractive to be used
                 as main memory. Commonly, user process dynamically
                 acquires memory through memory allocators. However,
                 traditional memory allocators designed with in-place
                 data writes are not appropriate for the non-volatile
                 main memory (NVRAM) due to the limited endurance. In
                 this article, first, we quantitatively analyze the
                 wear-oblivious of DRAM-oriented designed
                 allocator---glibc malloc and the inefficiency of
                 wear-conscious allocator NVMalloc. Then, we propose
                 WAlloc, an efficient wear-aware manual memory allocator
                 designed for NVRAM: (1) decouples metadata and data
                 management; (2) distinguishes metadata with volatility;
                 (3) redirects the data writes around to achieve
                 wear-leveling; (4) redesigns an efficient and effective
                 NVM copy mechanism, bypassing the CPU cache partially
                 and prefetching data explicitly. Finally, experimental
                 results show that the wear-leveling of WAlloc
                 outperforms that of NVMalloc about 30\% and 60\% under
                 random workloads and well-distributed workloads,
                 respectively. Besides, WAlloc reduces the average data
                 memory writes in 64 bytes block by 1.5 times comparing
                 with glibc malloc. With the fulfillment of data
                 persistency, cache bypassing NVM copy is better than
                 cache line flushing NVM copy with performance
                 improvement circa 14\%.",
  acknowledgement = ack-nhfb,
  articleno =    "49",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Li:2017:PUD,
  author =       "Bing Li and Yu Hu and Ying Wang and Jing Ye and
                 Xiaowei Li",
  title =        "Power-Utility-Driven Write Management for {MLC PCM}",
  journal =      j-JETC,
  volume =       "13",
  number =       "3",
  pages =        "50:1--50:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2997648",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jul 11 17:10:31 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Phase change memory (PCM) is a promising alternative
                 to Dynamic Random Access Memory (DRAM) as main memory
                 due to its merits of high density and low leakage
                 power. Multi-level Cell (MLC) PCM is more attractive
                 than Single-level Cell (SLC) PCM, because it can store
                 multiple bits per cell to achieve higher density and
                 lower per-bit cost. With the iterative program-verify
                 write technique, MLC PCM writes demand at much higher
                 power than DRAM writes, while the power supply system
                 of MLC memory system is similar to that of DRAM, and
                 the power capability is limited. The incompatibility of
                 high write power and limited power budget results in
                 the degradation of the write throughput and performance
                 in MLC PCM. In this work, we investigate both write
                 scheduling policy and power management to improve the
                 MLC power utility and alleviate the negative impacts
                 induced by high write power. We identify the
                 power-utility-driven write scheduling as an online
                 bin-packing problem and then derive a
                 power-utility-driven scheduling (PUDS) policy from the
                 First Fit algorithm to improve the write power usage.
                 Based on the ramp-down characteristic of the SET pulse
                 (the pulse changes the PCM to high resistance), we
                 propose the SET Power Amortization (SPA) policy, which
                 proactively reclaims the power tokens at the intra-SET
                 level to promote the power utilization. Our
                 experimental results demonstrate that the PUDS and SPA
                 respectively achieve 24\% and 27\% performance
                 improvement over the state-of-the-art power management
                 technique, and the PUDS\&SPA has an overall 31\%
                 improvement of the power utility and 50\% increase of
                 performance compared to the baseline system.",
  acknowledgement = ack-nhfb,
  articleno =    "50",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Ghosh:2017:AQC,
  author =       "Mrityunjay Ghosh and Amlan Chakrabarti and Niraj K.
                 Jha",
  title =        "Automated Quantum Circuit Synthesis and Cost
                 Estimation for the Binary Welded Tree Oracle",
  journal =      j-JETC,
  volume =       "13",
  number =       "4",
  pages =        "51:1--51:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3060582",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Aug 12 09:05:32 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Quantum computing is a new computational paradigm that
                 promises an exponential speed-up over classical
                 algorithms. To develop efficient quantum algorithms for
                 problems of a non-deterministic nature, random walk is
                 one of the most successful concepts employed. In this
                 article, we target both continuous-time and
                 discrete-time random walk in both the classical and
                 quantum regimes. Binary Welded Tree (BWT), or glued
                 tree, is one of the most well-known quantum walk
                 algorithms in the continuous-time domain. Prior work
                 implements quantum walk on the BWT with static welding.
                 In this context, static welding is randomized but
                 case-specific. We propose a solution to automatically
                 generate the circuit for the Oracle for welding. We
                 implement the circuit using the Quantum Assembly
                 Language, which is a language for describing quantum
                 circuits. We then optimize the generated circuit using
                 the Fault-Tolerant Quantum Logic Synthesis tool for any
                 BWT instance. Automatic welding enables us to provide a
                 generalized solution for quantum walk on the BWT.",
  acknowledgement = ack-nhfb,
  articleno =    "51",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Govindaraj:2017:DAS,
  author =       "Rekha Govindaraj and Swaroop Ghosh",
  title =        "Design and Analysis of {STTRAM}-Based Ternary Content
                 Addressable Memory Cell",
  journal =      j-JETC,
  volume =       "13",
  number =       "4",
  pages =        "52:1--52:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3060578",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Aug 12 09:05:32 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Content Addressable Memory (CAM) is widely used in
                 applications where searching a specific pattern of data
                 is a major operation. Conventional CAMs suffer from
                 area, power, and speed limitations. We propose
                 Spin-Torque Transfer RAM--based Ternary CAM (TCAM)
                 cells. The proposed NOR-type TCAM cell has a 62.5\%
                 (33\%) reduction in number of transistors compared to
                 conventional CMOS TCAMs (spintronic TCAMs). We analyzed
                 the sense margin of the proposed TCAM with respect to
                 16-, 32-, 64-, 128-, and 256-bit word sizes in 22nm
                 predictive technology. Simulations indicated a reliable
                 sense margin of 50mV even at 0.7V supply voltage for
                 256-bits word. We also explored a selective threshold
                 voltage modulation of transistors to improve the sense
                 margin and tolerate process and voltage variations. The
                 worst-case search latency and sense margin of 256-bit
                 TCAM is found to be 263ps and 220mV, respectively, at
                 1V supply voltage. The average search power consumed is
                 13mW, and the search energy is 4.7fJ/bit search. The
                 write time is 4ns, and the write energy is 0.69pJ/bit.
                 We leverage the NOR-type TCAM design to realize a 9T-2
                 Magnetic Tunnel Junctions NAND-type TCAM cell that has
                 43.75\% less number of transistors than the
                 conventional CMOS TCAM cell. A NAND-type cell can
                 support up to 64-bit words with a maximum sense margin
                 of up to 33mV. We compare the performance metrics of
                 NOR- and NAND-type TCAM cells with other TCAMs in the
                 literature.",
  acknowledgement = ack-nhfb,
  articleno =    "52",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Peter:2017:OON,
  author =       "Eldhose Peter and Anuj Arora and Janibul Bashir and
                 Akriti Bagaria and Smruti R. Sarangi",
  title =        "Optical Overlay {NUCA}: a High-Speed Substrate for
                 Shared {L2} Caches",
  journal =      j-JETC,
  volume =       "13",
  number =       "4",
  pages =        "53:1--53:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3064833",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Aug 12 09:05:32 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "In this article, we propose using optical
                 networks-on-chip (NoCs) to design cache access
                 protocols for large shared L2 caches. We observe that
                 the problem is unique because optical networks have
                 very low latency, and in principle all of the cache
                 banks are very close to each other. A naive approach is
                 to broadcast a request to a set of banks that might
                 possibly contain the copy of a block. However, this
                 approach is wasteful in terms of energy and bandwidth.
                 Hence, we propose a set of novel schemes that create a
                 set of virtual networks (overlays) of cache banks
                 over a physical optical NoC. We search for a block
                 inside each overlay using a combination of multicast
                 and unicast messages. We first propose two simple
                 protocols: TSI and Broadcast. The former uses unicast
                 messages, and the latter uses multicast messages. We
                 subsequently propose an improved scheme, OP\_BCAST,
                 that combines the best of TSI and Broadcast, and mainly
                 uses restricted multicast messages. Then we propose a
                 set of novel hardware structures for creating and
                 managing overlays, for efficiently locating blocks in
                 the overlay, and for implementing dynamically changing
                 overlays with OP\_BCAST. The performance of the TSI
                 scheme is within 2\% to 3\% of a broadcast scheme, and
                 it is faster than traditional schemes with electrical
                 networks by 26\%. Compared to the broadcast scheme, it
                 reduces the number of accesses, and consequently the
                 dynamic energy of the caches by 6\% to 8\%. OP\_BCAST
                 is 34\% faster than the best solutions with
                 copper-based NoCs; moreover, it reduces the dynamic
                 energy for cache access by 33\% compared to the TSI
                 scheme.",
  acknowledgement = ack-nhfb,
  articleno =    "53",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Koneru:2017:IEC,
  author =       "Abhishek Koneru and Sukeshwar Kannan and Krishnendu
                 Chakrabarty",
  title =        "Impact of Electrostatic Coupling and Wafer-Bonding
                 Defects on Delay Testing of Monolithic {$3$D}
                 Integrated Circuits",
  journal =      j-JETC,
  volume =       "13",
  number =       "4",
  pages =        "54:1--54:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3041026",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Aug 12 09:05:32 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Monolithic three-dimensional (M3D) integration is
                 gaining momentum, as it has the potential to achieve
                 significantly higher device density compared to 3D
                 integration based on through-silicon vias. M3D
                 integration uses several techniques that are not used
                 in the fabrication of conventional integrated circuits
                 (ICs). Therefore, a detailed analysis of the M3D
                 fabrication process is required to understand the
                 impact of defects that are likely to occur during chip
                 fabrication. In this article, we first analyze
                 electrostatic coupling in M3D ICs, which arises due to
                 the aggressive scaling of the interlayer dielectric
                 (ILD) thickness. We then analyze defects that arise due
                 to voids created during wafer bonding, a key step in
                 most M3D fabrication processes. We quantify the impact
                 of these defects on the threshold voltage of a
                 top-layer transistor in an M3D IC. We also show that
                 wafer-bonding defects can lead to a change in the
                 resistance of interlayer vias (ILVs), and in some cases
                 lead to an open in an ILV or a short between two ILVs.
                 We then analyze the impact of these defects on path
                 delays using HSpice simulations. We study their impact
                 on the effectiveness of delay-test patterns for
                 multiple instances of IWLS 2005 benchmarks in which
                 these defects were randomly injected. Our results show
                 that the timing characteristics of an M3D IC can be
                 significantly altered due to coupling and wafer-bonding
                 defects if the thickness of its ILD is less than 100nm.
                 Therefore, for such M3D ICs, test-generation methods
                 must be enhanced to take M3D fabrication defects into
                 account.",
  acknowledgement = ack-nhfb,
  articleno =    "54",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Houshmand:2017:QCS,
  author =       "Mahboobeh Houshmand and Mehdi Sedighi and Morteza
                 Saheb Zamani and Kourosh Marjoei",
  title =        "Quantum Circuit Synthesis Targeting to Improve One-Way
                 Quantum Computation Pattern Cost Metrics",
  journal =      j-JETC,
  volume =       "13",
  number =       "4",
  pages =        "55:1--55:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3064834",
  ISSN =         "1550-4832",
  bibdate =      "Sat Aug 12 09:05:32 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "One-way quantum computation (1WQC) is a model of
                 universal quantum computations in which a specific
                 highly entangled state called a cluster state allows
                 for quantum computation by single-qubit measurements.
                 The needed computations in this model are organized as
                 measurement patterns. The traditional approach to
                 obtain a measurement pattern is by translating a
                 quantum circuit that solely consists of CZ and $
                 J(\alpha) $ gates into the corresponding measurement
                 patterns and then performing some optimizations by
                 using techniques proposed for the 1WQC model. However,
                 in these cases, the input of the problem is a quantum
                 circuit, not an arbitrary unitary matrix. Therefore, in
                 this article, we focus on the first phase---that is,
                 decomposing a unitary matrix into CZ and $ J(\alpha) $
                 gates. Two well-known quantum circuit synthesis
                 methods, namely cosine-sine decomposition and quantum
                 Shannon decomposition are considered and then adapted
                 for a library of gates containing CZ and $ J(\alpha) $,
                 equipped with optimizations. By exploring the solution
                 space of the combinations of these two methods in a
                 bottom-up approach of dynamic programming, a
                 multiobjective quantum circuit synthesis method is
                 proposed that generates a set of quantum circuits. This
                 approach attempts to simultaneously improve the
                 measurement pattern cost metrics after the translation
                 from this set of quantum circuits.",
  acknowledgement = ack-nhfb,
  articleno =    "55",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Yogendra:2017:CST,
  author =       "Karthik Yogendra and Chamika Liyanagedera and Deliang
                 Fan and Yong Shim and Kaushik Roy",
  title =        "Coupled Spin-Torque Nano-Oscillator-Based Computation:
                 a Simulation Study",
  journal =      j-JETC,
  volume =       "13",
  number =       "4",
  pages =        "56:1--56:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3064835",
  ISSN =         "1550-4832",
  bibdate =      "Sat Aug 12 09:05:32 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "In this article, we present a comprehensive study of
                 four frequency locking mechanisms in Spin Torque Nano
                 Oscillators (STNOs) and explore their suitability for a
                 class of specialized computing applications. We
                 implemented a physical STNO model based on
                 Landau--Lifshitz--Gilbert--Slonczewski equation and
                 benchmarked the model to experimental data. Based on
                 our simulations, we provide an in-depth analysis of how
                 the ``self-organizing'' ability of coupled STNO array
                 can be effectively used for computations that are
                 unsuitable or inefficient in the von-Neumann computing
                 domain. As a case study, we demonstrate the computing
                 ability of coupled STNOs with two applications: edge
                 detection of an image and associative computing for
                 image recognition. We provide an analysis of the
                 scaling trends of STNOs and the effectiveness of
                 different frequency locking mechanisms with scaling in
                 the presence of thermal noise. We also provide an
                 in-depth analysis of the effect of variations on the
                 four locking mechanisms to find the most robust one in
                 the presence of variations.",
  acknowledgement = ack-nhfb,
  articleno =    "56",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Najafi:2017:RAS,
  author =       "M. Hassan Najafi and Peng Li and David J. Lilja and
                 Weikang Qian and Kia Bazargan and Marc Riedel",
  title =        "A Reconfigurable Architecture with Sequential
                 Logic-Based Stochastic Computing",
  journal =      j-JETC,
  volume =       "13",
  number =       "4",
  pages =        "57:1--57:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3060537",
  ISSN =         "1550-4832",
  bibdate =      "Sat Aug 12 09:05:32 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Computations based on stochastic bit streams have
                 several advantages compared to deterministic binary
                 radix computations, including low power consumption,
                 low hardware cost, high fault tolerance, and skew
                 tolerance. To take advantage of this computing
                 technique, previous work proposed a combinational
                 logic-based reconfigurable architecture to perform
                 complex arithmetic operations on stochastic streams of
                 bits. The long execution time and the cost of
                 converting between binary and stochastic
                 representations, however, make the stochastic
                 architectures less energy efficient than the
                 deterministic binary implementations. This article
                 introduces a methodology for synthesizing a given
                 target function stochastically using finite-state
                 machines (FSMs), and enhances and extends the
                 reconfigurable architecture using sequential logic.
                 Compared to the previous approach, the proposed
                 reconfigurable architecture can save hardware area and
                 energy consumption by up to 30\% and 40\%,
                 respectively, while achieving a higher processing
                 speed. Both stochastic reconfigurable architectures are
                 much more tolerant of soft errors (bit flips) than the
                 deterministic binary radix implementations, and their
                 fault tolerance scales gracefully to very large numbers
                 of errors.",
  acknowledgement = ack-nhfb,
  articleno =    "57",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Chittamuru:2017:SRS,
  author =       "Sai Vineel Reddy Chittamuru and Srinivas Desai and
                 Sudeep Pasricha",
  title =        "{SWIFTNoC}: a Reconfigurable Silicon-Photonic Network
                 with Multicast-Enabled Channel Sharing for Multicore
                 Architectures",
  journal =      j-JETC,
  volume =       "13",
  number =       "4",
  pages =        "58:1--58:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3060517",
  ISSN =         "1550-4832",
  bibdate =      "Sat Aug 12 09:05:32 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "On-chip communication is widely considered to be one
                 of the major performance bottlenecks in contemporary
                 chip multiprocessors (CMPs). With recent advances in
                 silicon nanophotonics, photonics-based network-on-chip
                 (NoC) architectures are being considered as a viable
                 solution to support communication in future CMPs as
                 they can enable higher bandwidth and lower power
                 dissipation compared to traditional electrical NoCs. In
                 this article, we present SwiftNoC, a novel
                 reconfigurable silicon-photonic NoC architecture that
                 features improved multicast-enabled channel sharing, as
                 well as dynamic re-prioritization and exchange of
                 bandwidth between clusters of cores running multiple
                 applications, to increase channel utilization and
                 system performance. Experimental results show that
                 SwiftNoC improves throughput by up to $ 25.4 \times $
                 while reducing latency by up to 72.4\% and
                 energy-per-bit by up to 95\% over state-of-the-art
                 solutions.",
  acknowledgement = ack-nhfb,
  articleno =    "58",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Samal:2017:IPU,
  author =       "Sandeep Kumar Samal and Guoqing Chen and Sung Kyu
                 Lim",
  title =        "Improving Performance under Process and Voltage
                 Variations in Near-Threshold Computing Using {$3$D}
                 {ICs}",
  journal =      j-JETC,
  volume =       "13",
  number =       "4",
  pages =        "59:1--59:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3060579",
  ISSN =         "1550-4832",
  bibdate =      "Sat Aug 12 09:05:32 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Near-threshold computing (NTC) circuits have been
                 shown to offer significant energy efficiency and power
                 benefits but with a huge performance penalty. This
                 performance loss exacerbates if process and voltage
                 variations are considered. In this article, we
                 demonstrate that three-dimensional (3D) IC technology
                 can overcome this limitation. We present a detailed
                 case study with a 28nm commercial-grade core at 0.6V
                 operation optimized with various 3D IC physical design
                 methods. First, our study under the deterministic case
                 shows that 3D IC NTC design outperforms 2D IC NTC by
                 29.5\% in terms of performance at comparable energy.
                 This is significantly higher than the 12.8\%
                 performance benefit of 3D IC at nominal voltage
                 supplies due to higher delay sensitivity to input slew
                 at lower voltages. Second, it is well demonstrated that
                 transistor delay is more sensitive to voltage changes
                 at NTC operation. However, our full-chip study reveals
                 that IR drop effect on 2D/3D IC NTC performance is not
                 severe due to the low power consumption and hence lower
                 IR drop values. Third, die-to-die variation impact on
                 full-chip performance is visible in 3D IC NTC designs,
                 but it is not worse compared to 2D IC NTC designs. This
                 is mainly due to the shorter critical path length in 3D
                 IC NTC designs.",
  acknowledgement = ack-nhfb,
  articleno =    "59",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Jiang:2017:RCC,
  author =       "Honglan Jiang and Cong Liu and Leibo Liu and Fabrizio
                 Lombardi and Jie Han",
  title =        "A Review, Classification, and Comparative Evaluation
                 of Approximate Arithmetic Circuits",
  journal =      j-JETC,
  volume =       "13",
  number =       "4",
  pages =        "60:1--60:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3094124",
  ISSN =         "1550-4832",
  bibdate =      "Sat Aug 12 09:05:32 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Often as the most important arithmetic modules in a
                 processor, adders, multipliers, and dividers determine
                 the performance and energy efficiency of many computing
                 tasks. The demand of higher speed and power efficiency,
                 as well as the feature of error resilience in many
                 applications (e.g., multimedia, recognition, and data
                 analytics), have driven the development of approximate
                 arithmetic design. In this article, a review and
                 classification are presented for the current designs of
                 approximate arithmetic circuits including adders,
                 multipliers, and dividers. A comprehensive and
                 comparative evaluation of their error and circuit
                 characteristics is performed for understanding the
                 features of various designs. By using approximate
                 multipliers and adders, the circuit for an image
                 processing application consumes as little as 47\% of
                 the power and 36\% of the power-delay product of an
                 accurate design while achieving similar image
                 processing quality. Improvements in delay, power, and
                 area are obtained for the detection of differences in
                 images by using approximate dividers.",
  acknowledgement = ack-nhfb,
  articleno =    "60",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Li:2017:EEC,
  author =       "Hui Li and S{\'e}bastien {Le Beux} and Martha Johanna
                 Sepulveda and Ian O'Connor",
  title =        "Energy-Efficiency Comparison of Multi-Layer Deposited
                 Nanophotonic Crossbar Interconnects",
  journal =      j-JETC,
  volume =       "13",
  number =       "4",
  pages =        "61:1--61:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3094125",
  ISSN =         "1550-4832",
  bibdate =      "Sat Aug 12 09:05:32 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Single-layer optical crossbar interconnections based
                 on Wavelength Division Multiplexing stand among other
                 nanophotonic interconnects because of their low latency
                 and low power. However, such architectures suffer from
                 a poor scalability due to losses induced by long
                 propagation distances on waveguides and waveguide
                 crossings. Multi-layer deposited silicon technology
                 allows the stacking of optical layers that are
                 connected by means of Optical Vertical Couplers. This
                 allows significant reduction in the optical losses,
                 which contributes to improve the interconnect
                 scalability but also leads to new challenges related to
                 network designs and layouts. In this article, we
                 investigate the design of optical crossbars using
                 multi-layer silicon deposited technology. We propose
                 implementations for Ring-, Matrix-, $ \lambda
                 $-router-, and Snake-based topologies. Layouts avoiding
                 waveguide crossings are compared to those minimizing
                 the waveguide length according to worst-case and
                 average losses. The laser output power is estimated
                 from the losses, which allows us to evaluate the energy
                 efficiency improvement induced by multi-layer
                 technology over traditional planar implementations
                 (33\% on average). Finally, networks comparison has
                 been carried out and the results show that the ring
                 topology leads to a 43\% reduction in the laser output
                 power.",
  acknowledgement = ack-nhfb,
  articleno =    "61",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Gala:2018:ATN,
  author =       "Neel Gala and Sarada Krithivasan and Wei-Yu Tsai and
                 Xueqing Li and Vijaykrishnan Narayanan and V.
                 Kamakoti",
  title =        "An Accuracy Tunable Non-{Boolean} Co-Processor Using
                 Coupled Nano-Oscillators",
  journal =      j-JETC,
  volume =       "14",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3094263",
  ISSN =         "1550-4832",
  bibdate =      "Thu Nov 1 16:44:40 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "As we enter an era witnessing the closer end of
                 Dennard scaling, where further reduction in power
                 supply-voltage to reduce power consumption becomes more
                 challenging in conventional systems, a goal of
                 developing a system capable of performing large
                 computations with minimal area and power overheads
                 needs more optimization aspects. A rigorous exploration
                 of alternate computing techniques, which can mitigate
                 the limitations of Complementary Metal-Oxide
                 Semiconductor (CMOS) technology scaling and
                 conventional Boolean systems, is imperative. Reflecting
                 on these lines of thought, in this article we explore
                 the potential of non-Boolean computing employing
                 nano-oscillators for performing varied functions. We
                 use a two coupled nano-oscillator as our basic
                 computational model and propose an architecture for a
                 non-Boolean coupled oscillator based co-processor
                 capable of executing certain functions that are
                 commonly used across a variety of approximate
                 application domains. The proposed architecture includes
                 an accuracy tunable knob, which can be tuned by the
                 programmer at runtime. The functionality of the
                 proposed co-processor is verified using a soft coupled
                 oscillator model based on Kuramoto oscillators. The
                 article also demonstrates how real-world applications
                 such as Vector Quantization, Digit Recognition,
                 Structural Health Monitoring, and the like, can be
                 deployed on the proposed model. The proposed
                 co-processor architecture is generic in nature and can
                 be implemented using any of the existing modern day
                 nano-oscillator technologies such as Resonant Body
                 Transistors (RBTs), Spin-Torque Nano-Oscillators
                 (STNOs), and Metal-Insulator Transition (MITs). In
                 this article, we perform a validation of the proposed
                 architecture using the HyperField Effect Transistor
                 (FET) technology-based coupled oscillators, which
                 provide improvements of up to $ 3.5 \times $ increase
                 in clock speed and up to $ 10.75 \times $ and $ 14.12
                 \times $ reduction in area and power consumption,
                 respectively, as compared to a conventional Boolean
                 CMOS accelerator executing the same functions.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Uddin:2018:DCM,
  author =       "Mesbah Uddin and MD. Badruddoja Majumder and Karsten
                 Beckmann and Harika Manem and Zahiruddin Alamgir and
                 Nathaniel C. Cady and Garrett S. Rose",
  title =        "Design Considerations for Memristive Crossbar Physical
                 Unclonable Functions",
  journal =      j-JETC,
  volume =       "14",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3094414",
  ISSN =         "1550-4832",
  bibdate =      "Thu Nov 1 16:44:40 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Hardware security has emerged as a field concerned
                 with issues such as integrated circuit (IC)
                 counterfeiting, cloning, piracy, and reverse
                 engineering. Physical unclonable functions (PUF) are
                 hardware security primitives useful for mitigating such
                 issues by providing hardware-specific fingerprints
                 based on intrinsic process variations within individual
                 IC implementations. As technology scaling progresses
                 further into the nanometer region, emerging
                 nanoelectronic technologies, such as memristors or
                 RRAMs (resistive random-access memory), have become
                 interesting options for emerging computing systems. In
                 this article, using a comprehensive temperature
                 dependent model of an HfO$_x$ (hafnium-oxide)
                 memristor, based on experimental measurements, we
                 explore the best region of operation for a memristive
                 crossbar PUF (XbarPUF). The design considered also
                 employs XORing and a column shuffling technique to
                 improve reliability and resilience to machine learning
                 attacks. We present a detailed analysis for the noise
                 margin and discuss the scalability of the XbarPUF
                 structure. Finally, we present results for estimates of
                 area, power, and delay alongside security performance
                 metrics to analyze the strengths and weaknesses of the
                 XbarPUF. Our XbarPUF exhibits nearly ideal (near 50\%)
                 uniqueness, bit-aliasing and uniformity, good
                 reliability of 90\% and up (with 100\% being ideal), a
                 very small footprint, and low average power consumption
                 $ \approx 104 \mu $W.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Yu:2018:SOF,
  author =       "Ye Yu and Niraj K. Jha",
  title =        "Statistical Optimization of {FinFET} Processor
                 Architectures under {PVT} Variations Using Dual
                 Device-Type Assignment",
  journal =      j-JETC,
  volume =       "14",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3110714",
  ISSN =         "1550-4832",
  bibdate =      "Thu Nov 1 16:44:40 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "With semiconductor technology scaling to the 22nm node
                 and beyond, fin field-effect transistor (FinFET) has
                 started replacing complementary metal-oxide
                 semiconductor (CMOS), thanks to its superior control of
                 short-channel effects and much lower leakage current.
                 However, process, supply voltage, and temperature (PVT)
                 variations across the integrated circuit (IC) become
                 worse with technology scaling. Thus, to analyze timing,
                 leakage power, and dynamic power under PVT variations,
                 statistical analysis/optimization techniques are more
                 suitable than traditional static timing/power analysis
                 and optimization counterparts. In this article, we
                 propose a statistical optimization framework using dual
                 device-type assignment at the architecture level under
                 PVT variations that takes spatial correlations into
                 account and leverages circuit-level statistical
                 analysis techniques. To the best of our knowledge, this
                 is the first work to study statistical optimization at
                 the system level under PVT variations. Simulation
                 results show that leakage power yield and dynamic power
                 yield at the mean value of the baseline can be improved
                 by up to 44.2\% and 21.2\%, respectively, with no loss
                 in timing yield for a single-core processor and up to
                 43.0\% and 50.0\%, respectively, without any loss in
                 timing yield for an 8-core chip multiprocessor (CMP),
                 at little area overhead. Under the same (99.0\%) power
                 yield constraints, leakage power and dynamic power are
                 reduced by up to 91.2\% and 4.3\%, respectively, for a
                 single-core processor, and up to 44.6\% and 12.5\%,
                 respectively, for an 8-core CMP, with no loss in timing
                 yield. We also show that optimizations performed
                 without taking module-to-module and core-to-core
                 spatial correlations into account overestimate yield,
                 establishing the importance of taking such correlations
                 into account.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Hajkazemi:2018:HHM,
  author =       "Mohammad Hossein Hajkazemi and Mohammad Khavari Tavana
                 and Tinoosh Mohsenin and Houman Homayoun",
  title =        "Heterogeneous {HMC + DDRx} Memory Management for
                 Performance-Temperature Tradeoffs",
  journal =      j-JETC,
  volume =       "14",
  number =       "1",
  pages =        "4:1--4:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3106233",
  ISSN =         "1550-4832",
  bibdate =      "Thu Nov 1 16:44:40 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Three-dimensional DRAMs (3D-DRAMs) are emerging as a
                 promising solution to address the memory wall problem
                 in computer systems. However, high fabrication cost per
                 bit and thermal issues are the main reasons that
                 prevent architects from using 3D-DRAM alone as the main
                 memory building block. In this article, we address this
                 issue by proposing a heterogeneous memory system that
                 combines a double data rate (DDRx) DRAM with an
                 emerging 3D hybrid memory cube (HMC) technology.
                 Bandwidth and temperature management are the
                 challenging issues for this heterogeneous memory
                 architecture. To address these challenges, first we
                 introduce a memory page allocation policy for the
                 heterogeneous memory system to maximize performance.
                 Then, using the proposed policy, we introduce a
                 temperature-aware algorithm that dynamically
                 distributes the requested bandwidth between HMC and
                 DDRx DRAM to reduce the thermal hotspot while
                 maintaining high performance. We take into account the
                 impact of both core count and HMC channel count on
                 performance while using the proposed policies. The
                 results show that the proposed memory page allocation
                 policy can utilize the memory bandwidth close to 99\%
                 of the ideal bandwidth utilization. Moreover, our
                 temperature-aware bandwidth adaptation reduces the
                 average steady-state temperature of the HMC hotspot
                 across various workloads by 4.5 K while incurring 2.5\%
                 performance overhead.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@Article{Bhattacharjee:2018:RFT,
  author =       "Sukanta Bhattacharjee and Debasis Mitra and Bhargab B.
                 Bhattacharya",
  title =        "Robust In-Field Testing of Digital Microfluidic
                 Biochips",
  journal =      j-JETC,
  volume =       "14",
  number =       "1",
  pages =        "5:1--5:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3123586",
  ISSN =         "1550-4832",
  bibdate =      "Thu Nov 1 16:44:40 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Microfluidic technology offers vast promise for
                 implementing biochemistry-on-chip with diverse
                 applications to clinical diagnosis, genome analysis,
                 drug design, and point-of-care testing. Among various
                 types of fluid-chips, droplet-based digital
                 microfluidic biochips (DMFBs), which consist of a
                 patterned array of controllable electrodes, provide the
                 advantage of programmability, ease of fluidic
                 operations, and versatile droplet mobility. However,
                 because of manufacturing or field defects, electrode
                 degradation, or dielectric breakdown, these chips may
                 suffer from incorrect fluidic behavior. Reliability of
                 fluidic operations is of utmost concern in DMFBs that
                 are used to perform safety-critical bio-protocols.
                 Various methods are deployed to test these devices,
                 either offline or being overlapped with bioassay
                 operations (termed as concurrent or in-field testing).
                 The main challenge of in-field testing lies in the fact
                 that the test must run concurrently with the execution
                 of the normal assay without hampering the correctness
                 of the latter. In prior work, optimal testing for
                 droplet mobility over all electrodes was formulated in
                 terms of finding either a Hamiltonian path or an
                 Eulerian path in an undirected graph that represents
                 the electrode-adjacency structure. Although these
                 models have been studied for offline testing, no such
                 effort was made in the area of concurrent testing. In
                 this work, we propose, for in-field application, an
                 SAT-based modeling and solution approach to find an
                 optimal test plan that can be used to check droplet
                 movement across the boundary between every pair of
                 adjacent electrodes, which is visited by the droplets
                 of the ongoing assay. The proposed method is robust and
                 determines a test solution successfully regardless of
                 the cover assay that is being executed concurrently.
                 Experiments on several real-life assays and other test
                 cases demonstrate the effectiveness of the method with
                 respect to test completion time.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@article{Yang:2018:IAC,
  author =       {Xiaokun Yang and Wujie Wen and Ming Fan},
  title =        {Improving {AES} Core Performance via an Advanced
                 {ASBUS} Protocol},
  journal =      j-JETC,
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)},
  volume =       {14},
  number =       {1},
  articleno =    {6},
  pages =        {6:1--6:??},
  month =        mar,
  year =         {2018},
  DOI =          {https://doi.org/10.1145/3110713},
  ISSN =         {1550-4832},
  CODEN =        {????},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
  abstract =     {Security is becoming a de-facto requirement of
                 System-on-Chips (SoC), leading up to a significant
                 share of circuit design cost. In this article, we
                 propose an advanced SBUS protocol (ASBUS), to improve
                 the data feeding efficiency of the Advanced Encryption
                 Standard (AES) encrypted circuits. As a case study, the
                 direct memory access (DMA) combined with AES engine and
                 memory controller are implemented as our
                 design-under-test (DUT) using field-programmable gate
                 arrays (FPGA). The results show that our presented
                 ASBUS structure outperforms the AXI-based design for
                 cipher tests. As an example, the 32-bit ASBUS design
                 costs less in terms of hardware resources and achieves
                 higher throughput ($ 1.30 \times $) than the 32-bit AXI
                 implementation, and the dynamic energy consumed by the
                 ASBUS cipher test is reduced to 71.27\% compared with
                 the AXI test.},
  acknowledgement = ack-nhfb,
  bibdate =      {Thu Nov 1 16:44:40 MDT 2018},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib},
}

%%% First author's surname is spelled O'Neal (capital N).  The citation
%%% key is left as Oneal:2018:RCS so that existing citations still resolve.
@Article{Oneal:2018:RCS,
  author =       "Kenneth O'Neal and Daniel Grissom and Philip Brisk",
  title =        "Resource-Constrained Scheduling for Digital
                 Microfluidic Biochips",
  journal =      j-JETC,
  volume =       "14",
  number =       "1",
  pages =        "7:1--7:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3093930",
  ISSN =         "1550-4832",
  bibdate =      "Thu Nov 1 16:44:40 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Digital microfluidics based on
                 electrowetting-on-dielectric technology is poised to
                 revolutionize many aspects of chemistry and
                 biochemistry through miniaturization, automation, and
                 software programmability. Digital microfluidic biochips
                 (DMFBs) offer ample spatial parallelism, which is then
                 exposed to the compiler. The first problem that a DMFB
                 compiler must solve is resource-constrained scheduling,
                 which is NP-complete. If the compiler is applied
                 off-line, then long-running algorithms that produce
                 solutions of high quality, such as iterative
                 improvement or branch-and-bound search, can be applied;
                 in an online context, where a biochemical reaction is
                 to be executed as soon as it is specified by the
                 programmer, heuristics that sacrifice solution quality
                 to attain a fast runtime are used. This article
                 describes in detail the algorithms and heuristics that
                 have been proposed for resource-constrained scheduling,
                 focusing on several recent contributions: path
                 scheduling and force-directed list scheduling. It also
                 discusses shortcomings and limitations of existing
                 optimal scheduling problem formulations based on
                 Integer Linear Programming and presents an updated
                 formulation that addresses these issues. The algorithms
                 are compared and evaluated on an extensive benchmark
                 suite of biochemical assays used for applications, such
                 as in vitro diagnostics, protein crystallization, and
                 automated sample preparation.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% Abstract: a space separates each numeric value from its unit symbol
%%% (9 ns, 190 microwatts).
@Article{Motaman:2018:IPV,
  author =       "Seyedhamidreza Motaman and Swaroop Ghosh and Jaydeep
                 Kulkarni",
  title =        "Impact of Process Variation on Self-Reference Sensing
                 Scheme and Adaptive Current Modulation for Robust
                 {STTRAM} Sensing",
  journal =      j-JETC,
  volume =       "14",
  number =       "1",
  pages =        "8:1--8:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3132577",
  ISSN =         "1550-4832",
  bibdate =      "Thu Nov 1 16:44:40 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Spin-Transfer-Torque RAM (STTRAM) is a promising
                 technology for high-density on-chip cache due to low
                 standby power and high speed. However, the process
                 variation of the Magnetic Tunnel Junction (MTJ) and
                 access transistor poses a serious challenge to sensing.
                 Nondestructive sensing suffers from reference
                 resistance variation, whereas destructive sensing
                 suffers from failures due to unoptimized selection of
                 data and reference currents. Furthermore, the sense
                 speed is tightly coupled with the reference/data
                 current requirement. In this work, we study the process
                 variation effect on a self-reference sensing scheme to
                 eliminate bit-to-bit process variation in MTJ
                 resistance. Read current modulation is proposed to
                 overcome the failures due to process variation.
                 Simulation results reveal $ < 0.01 \% $ failures at the
                 cost of 9 ns sense time and 190 $ \mu $W power
                 consumption.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@article{Catania:2018:IEE,
  author =       {Vincenzo Catania and Andrea Mineo and Salvatore
                 Monteleone and Maurizio Palesi and Davide Patti},
  title =        {Improving Energy Efficiency in Wireless
                 Network-on-Chip Architectures},
  journal =      j-JETC,
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)},
  volume =       {14},
  number =       {1},
  articleno =    {9},
  pages =        {9:1--9:??},
  month =        mar,
  year =         {2018},
  DOI =          {https://doi.org/10.1145/3138807},
  ISSN =         {1550-4832},
  CODEN =        {????},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
  abstract =     {Wireless Network-on-Chip (WiNoC) represents a
                 promising emerging communication technology for
                 addressing the scalability limitations of future
                 manycore architectures. In a WiNoC, high-latency and
                 power-hungry long-range multi-hop communications can be
                 realized by performance- and energy-efficient
                 single-hop wireless communications. However, the energy
                 contribution of such wireless communication accounts
                 for a significant fraction of the overall communication
                 energy budget. This article presents a novel energy
                 managing technique for WiNoC architectures aimed at
                 improving the energy efficiency of the main elements of
                 the wireless infrastructure, namely, radio-hubs. The
                 rationale behind the proposed technique is based on
                 selectively turning off, for the appropriate number of
                 cycles, all the radio-hubs that are not involved in the
                 current wireless communication. The proposed energy
                 managing technique is assessed on several network
                 configurations under different traffic scenarios both
                 synthetic and extracted from the execution of real
                 applications. The obtained results show that the
                 application of the proposed technique allows up to 25\%
                 total communication energy saving without any impact on
                 performance and with a negligible impact on the silicon
                 area of the radio-hub.},
  acknowledgement = ack-nhfb,
  bibdate =      {Thu Nov 1 16:44:40 MDT 2018},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
}

%%% Abstract: the parenthetical list of code parameters is set off with
%%% em-dashes (---), and the reliability factors are typeset in math mode.
@Article{Li:2018:ELC,
  author =       "Bohua Li and Yukui Pei and Wujie Wen",
  title =        "Efficient {LDPC} Code Design for Combating Asymmetric
                 Errors in {STT-RAM}",
  journal =      j-JETC,
  volume =       "14",
  number =       "1",
  pages =        "10:1--10:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3154836",
  ISSN =         "1550-4832",
  bibdate =      "Thu Nov 1 16:44:40 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Spin-transfer torque random access memory (STT-RAM) is
                 a promising emerging memory technology in the future
                 memory hierarchy. However, its unique reliability
                 challenges, i.e., the asymmetric bit failure mechanism
                 at different bit flippings, have raised significant
                 concerns in its real applications. Recent studies even
                 show that the common memory error repair ``remedies''
                 cannot efficiently address them. In this article, we
                 for the first time systematically study the potentials
                 of the strong low-density parity-check (LDPC) code for
                 combating such unique asymmetric errors in both
                 single-level-cell (SLC) and multi-level-cell (MLC)
                 STT-RAM designs. A generic STT-RAM channel model
                 suitable for the SLC/MLC designs, is developed to
                 analytically calibrate all the accumulated asymmetric
                 factors of the write/read operations. The key initial
                 information for LDPC decoding, namely asymmetric
                 log-likelihood ratio (A-LLR), is redesigned and
                 extracted from the proposed channel model, to unleash
                 the LDPC's asymmetric error correcting capability. LDPC
                 codec is also carefully designed to lower the hardware
                 cost by leveraging the systematic-structured parity
                 check matrix. Then two customized short-length LDPC
                 codes---(585,512) and (683,512)---augmented from the
                 semi-random parity check matrix and the A-LLR based
                 asymmetric decoding, are proposed for SLC and MLC
                 STT-RAM designs, respectively. Experiments show that
                 our proposed LDPC designs can improve the STT-RAM
                 reliability by at least $ 10^2 $ ($ 10^4 $) when
                 compared to the existing error correction codes (ECCs)
                 for the SLC (MLC) design, demonstrating the feasibility
                 of LDPC solutions on STT-RAM.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@article{Liu:2018:OAE,
  author =       {Yu Liu and Yingyezhe Jin and Peng Li},
  title =        {Online Adaptation and Energy Minimization for Hardware
                 Recurrent Spiking Neural Networks},
  journal =      j-JETC,
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)},
  volume =       {14},
  number =       {1},
  articleno =    {11},
  pages =        {11:1--11:??},
  month =        mar,
  year =         {2018},
  DOI =          {https://doi.org/10.1145/3145479},
  ISSN =         {1550-4832},
  CODEN =        {????},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
  abstract =     {The Liquid State Machine (LSM) is a promising model of
                 recurrent spiking neural networks that provides an
                 appealing brain-inspired computing paradigm for
                 machine-learning applications such as pattern
                 recognition. Moreover, processing information directly
                 on spiking events makes the LSM well suited for cost-
                 and energy-efficient hardware implementation. In this
                 article, we systematically present three techniques for
                 optimizing energy efficiency while maintaining good
                 performance of the proposed LSM neural processors from
                 both an algorithmic and hardware implementation point
                 of view. First, to realize adaptive LSM neural
                 processors, thus boost learning performance, we propose
                 a hardware-friendly Spike-Timing Dependent Plastic
                 (STDP) mechanism for on-chip tuning. Then, the LSM
                 processor incorporates a novel runtime
                 correlation-based neuron gating scheme to minimize the
                 power dissipated by reservoir neurons. Furthermore, an
                 activity-dependent clock gating approach is presented
                 to address the energy inefficiency due to the
                 memory-intensive nature of the proposed neural
                 processors. Using two different real-world tasks of
                 speech and image recognition to benchmark, we
                 demonstrate that the proposed architecture boosts the
                 average learning performance by up to 2.0\% while
                 reducing energy dissipation by up to 29\% compared to a
                 baseline LSM with little extra hardware overhead on a
                 Xilinx Virtex-6 FPGA.},
  acknowledgement = ack-nhfb,
  bibdate =      {Thu Nov 1 16:44:40 MDT 2018},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
}

@article{Grani:2018:SPS,
  author =       {Paolo Grani and Sandro Bartolini},
  title =        {Scalable Path-Setup Scheme for All-Optical Dynamic
                 Circuit Switched {NoCs} in Cache Coherent {CMPs}},
  journal =      j-JETC,
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)},
  volume =       {14},
  number =       {1},
  articleno =    {12},
  pages =        {12:1--12:??},
  month =        mar,
  year =         {2018},
  DOI =          {https://doi.org/10.1145/3154840},
  ISSN =         {1550-4832},
  CODEN =        {????},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
  abstract =     {Nanophotonics is a promising solution for on-chip
                 interconnection due to its intrinsic low-latency and
                 low-power features, which can be useful for performance
                 and energy in future Chip Multi-Processors (CMPs). This
                 article proposes a novel arbitrated all-optical
                 path-setup scheme for tiled CMPs adopting
                 circuit-switched optical networks. It aims at
                 significantly reducing path-setup latency and overall
                 energy consumption. The proposed arbitrated scheme is
                 able to configure multiple photonic switches
                 simultaneously, instead of sequentially as it is done
                 in state-of-the-art proposals. The proposed fast
                 optical path-setup solution reduces the overhead in
                 each transmission and, most importantly, allows optical
                 circuit-switched networks to effectively serve cache
                 coherence traffic, which is mainly composed of
                 relatively small messages. Specifically, we propose a
                 single-arbiter scheme where the whole topology is
                 managed by a central module (single-arbiter) that takes
                 care of the path-setup procedures. Then, to tackle
                 scalability, we propose a logically clustered
                 architecture (multi-arbiter) in which an arbiter is
                 allocated in each logical core-cluster and an ad hoc
                 distributed reservation protocol coordinates arbiters
                 to manage inter-cluster path reservations. We show that
                 our proposed single-arbiter architecture outperforms a
                 state-of-the-art optical network with sequential
                 path-setup (optical baseline) in the case of 8- and
                 16-core tiled CMP setups. However, due to serialization
                 issues, the single-arbiter solution is not able to
                 compete with a reference electronic baseline for bigger
                 32- and 64-core setups even if still performing much
                 better than the optical baseline. Conversely, our
                 multi-arbiter hierarchical solution allows us to
                 improve performance up to almost 20\% and 40\% for 32-
                 and 64-core setups, respectively, demonstrating a wide
                 applicability of the proposed technique. Energy-wise,
                 the analyzed solutions enable significant savings
                 compared to both the optical baseline with sequential
                 path setup, and to the electronic counterpart.
                 Specifically, results show more than 25\% average
                 improvement for the single-arbiter in the 8- and
                 16-core cases, and more than 40\% and 15\% savings for
                 the multi-arbiter in the 32- and 64-core cases,
                 respectively.},
  acknowledgement = ack-nhfb,
  bibdate =      {Thu Nov 1 16:44:40 MDT 2018},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
}

@article{VanRynbach:2018:QCP,
  author =       {Andre {Van Rynbach} and Muhammad Ahsan and Jungsang
                 Kim},
  title =        {A Quantum Computing Performance Simulator Based on
                 Circuit Failure Probability and Fault Path Counting},
  journal =      j-JETC,
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)},
  volume =       {14},
  number =       {1},
  articleno =    {13},
  pages =        {13:1--13:??},
  month =        mar,
  year =         {2018},
  DOI =          {https://doi.org/10.1145/3154837},
  ISSN =         {1550-4832},
  CODEN =        {????},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
  abstract =     {Quantum computing performance simulators are needed to
                 provide practical metrics for the effectiveness of
                 executing theoretical quantum information processing
                 protocols on physical hardware. In this work, we
                 present a tool to simulate the execution of
                 fault-tolerant quantum computation by automating the
                 tracking of common fault paths for error propagation
                 through an encoded circuit block and quantifying the
                 failure probability of each encoded qubit throughout
                 the circuit. Our simulator runs a fault path counter on
                 encoded circuit blocks to determine the probability
                 that two or more errors remain on the encoded qubits
                 after each block is executed, and it combines errors
                 from all the encoded blocks to estimate performance
                 metrics such as the logical qubit failure probability,
                 the overall circuit failure probability, the number of
                 qubits used, and the time required to run the overall
                 circuit. Our technique efficiently estimates the upper
                 bound of the error probability and provides a useful
                 measure of the error threshold at low error
                 probabilities where conventional Monte Carlo methods
                 are ineffective. We describe a way of simplifying the
                 fault-tolerant measurement process in the Steane code
                 to reduce the number of error correction steps
                 necessary. We present simulation results comparing the
                 execution of quantum adders, which constitute a major
                 part of Shor's algorithm.},
  acknowledgement = ack-nhfb,
  bibdate =      {Thu Nov 1 16:44:40 MDT 2018},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
}

@article{Cao:2018:GEI,
  author =       {Yu Cao and Xin Li and Jae-Sun Seo and Ganesh Dasika},
  title =        {{Guest Editors}' Introduction: Frontiers of Hardware
                 and Algorithms for On-chip Learning},
  journal =      j-JETC,
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)},
  volume =       {14},
  number =       {2},
  articleno =    {14},
  pages =        {14:1--14:??},
  month =        jul,
  year =         {2018},
  DOI =          {https://doi.org/10.1145/3205944},
  ISSN =         {1550-4832},
  CODEN =        {????},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
  acknowledgement = ack-nhfb,
  bibdate =      {Thu Nov 1 16:44:40 MDT 2018},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
}

@article{Kim:2018:DNN,
  author =       {Hyungjun Kim and Taesu Kim and Jinseok Kim and
                 Jae-Joon Kim},
  title =        {Deep Neural Network Optimized to Resistive Memory with
                 Nonlinear Current-Voltage Characteristics},
  journal =      j-JETC,
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)},
  volume =       {14},
  number =       {2},
  articleno =    {15},
  pages =        {15:1--15:??},
  month =        jul,
  year =         {2018},
  DOI =          {https://doi.org/10.1145/3145478},
  ISSN =         {1550-4832},
  CODEN =        {????},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
  abstract =     {Artificial Neural Network computation relies on
                 intensive vector-matrix multiplications. Recently, the
                 emerging nonvolatile memory (NVM) crossbar array showed
                 a feasibility of implementing such operations with high
                 energy efficiency. Thus, there have been many works on
                 efficiently utilizing emerging NVM crossbar arrays as
                 analog vector-matrix multipliers. However, nonlinear
                 I-V characteristics of NVM restrain critical design
                 parameters, such as the read voltage and weight range,
                 resulting in substantial accuracy loss. In this
                 article, instead of optimizing hardware parameters to a
                 given neural network, we propose a methodology of
                 reconstructing the neural network itself to be
                 optimized to resistive memory crossbar arrays. To
                 verify the validity of the proposed method, we
                 simulated various neural networks with MNIST and
                 CIFAR-10 dataset using two different Resistive Random
                 Access Memory models. Simulation results show that our
                 proposed neural network produces inference accuracies
                 significantly higher than conventional neural network
                 when the network is mapped to synapse devices with
                 nonlinear I-V characteristics.},
  acknowledgement = ack-nhfb,
  bibdate =      {Thu Nov 1 16:44:40 MDT 2018},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
}

@article{Sarwar:2018:EEN,
  author =       {Syed Shakib Sarwar and Swagath Venkataramani and
                 Aayush Ankit and Anand Raghunathan and Kaushik Roy},
  title =        {Energy-Efficient Neural Computing with Approximate
                 Multipliers},
  journal =      j-JETC,
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)},
  volume =       {14},
  number =       {2},
  articleno =    {16},
  pages =        {16:1--16:??},
  month =        jul,
  year =         {2018},
  DOI =          {https://doi.org/10.1145/3097264},
  ISSN =         {1550-4832},
  CODEN =        {????},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
  abstract =     {Neural networks, with their remarkable ability to
                 derive meaning from a large volume of complicated or
                 imprecise data, can be used to extract patterns and
                 detect trends that are too complex for the von Neumann
                 computing paradigm. Their considerable computational
                 requirements stretch the capabilities of even modern
                 computing platforms. We propose an approximate
                 multiplier that exploits the inherent application
                 resilience to error and utilizes the notion of
                 computation sharing to achieve improved energy
                 consumption for neural networks. We also propose a
                 Multiplier-less Artificial Neuron (MAN), which is even
                 more compact and energy efficient. We also propose a
                 network retraining methodology to recover some of the
                 accuracy loss due to the use of these approximate
                 multipliers. We evaluated the proposed algorithm/design
                 on several recognition applications. The results show
                 that we achieve $ \approx $33\%, $ \approx $32\%, and $
                 \approx $25\% reduction in power consumption and $
                 \approx $33\%, $ \approx $34\%, and $ \approx $27\%
                 reduction in area, respectively, for 12-, 8-, and 4-bit
                 MAN, with a maximum $ \approx $2.4\% loss in accuracy
                 compared to a conventional neuron implementation of
                 equivalent bit precision. These comparisons were
                 performed under iso-speed conditions.},
  acknowledgement = ack-nhfb,
  bibdate =      {Thu Nov 1 16:44:40 MDT 2018},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
}

%%% Abstract: a space separates each numeric value from its unit symbol
%%% (MHz, KB, dB, ms, nm, mW).
@Article{Ko:2018:RTL,
  author =       "Glenn G. Ko and Rob A. Rutenbar",
  title =        "Real-Time and Low-Power Streaming Source Separation
                 Using {Markov} Random Field",
  journal =      j-JETC,
  volume =       "14",
  number =       "2",
  pages =        "17:1--17:??",
  month =        jul,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3183351",
  ISSN =         "1550-4832",
  bibdate =      "Thu Nov 1 16:44:40 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Machine learning (ML) has revolutionized a wide range
                 of recognition tasks, ranging from text analysis to
                 speech to vision, most notably in cloud deployments.
                 However, mobile deployment of these ideas involves a
                 very different category of design problems. In this
                 article, we develop a hardware architecture for a sound
                 source separation task, intended for deployment on a
                 mobile phone. We focus on a novel Markov random field
                 (MRF) sound source separation algorithm that uses
                 expectation-maximization and Gibbs sampling to learn
                 MRF parameters on the fly and infer the best separation
                 of sources. The intrinsically iterative algorithm
                 suggests challenges for both speed and power. A
                 real-time streaming FPGA implementation runs at 150 MHz
                 with 207 KB RAM, achieves a speed-up of $ 22 \times $
                 over a software reference, performs with an SDR of up
                 to 7.021 dB with 1.601 ms latency, and exhibits
                 excellent perceived audio quality. A 45 nm CMOS ASIC
                 virtual prototype simulated at 20 MHz shows that this
                 architecture is small ({$<$10} million gates) and
                 consumes only 70 mW, which is less than 2\% of the
                 power of an ARM Cortex-A9 software version. To the best
                 of our knowledge, this is the first Gibbs sampling
                 inference accelerator designed in conventional
                 FPGA/ASIC technology that targets a realistic mobile
                 perceptual application.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

@article{Li:2018:GOF,
  author =       {Yixing Li and Zichuan Liu and Kai Xu and Hao Yu and
                 Fengbo Ren},
  title =        {A {GPU}-Outperforming {FPGA} Accelerator Architecture
                 for Binary Convolutional Neural Networks},
  journal =      j-JETC,
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)},
  volume =       {14},
  number =       {2},
  articleno =    {18},
  pages =        {18:1--18:??},
  month =        jul,
  year =         {2018},
  DOI =          {https://doi.org/10.1145/3154839},
  ISSN =         {1550-4832},
  CODEN =        {????},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
  abstract =     {FPGA-based hardware accelerators for convolutional
                 neural networks (CNNs) have received attention due to
                 their higher energy efficiency than GPUs. However, it
                 is challenging for FPGA-based solutions to achieve a
                 higher throughput than GPU counterparts. In this
                 article, we demonstrate that FPGA acceleration can be a
                 superior solution in terms of both throughput and
                 energy efficiency when a CNN is trained with binary
                 constraints on weights and activations. Specifically,
                 we propose an optimized fully mapped FPGA accelerator
                 architecture tailored for bitwise convolution and
                 normalization that features massive spatial parallelism
                 with deep pipelines stages. A key advantage of the FPGA
                 accelerator is that its performance is insensitive to
                 data batch size, while the performance of GPU
                 acceleration varies largely depending on the batch size
                 of the data. Experiment results show that the proposed
                 accelerator architecture for binary CNNs running on a
                 Virtex-7 FPGA is $ 8.3 \times $ faster and $ 75 \times
                 $ more energy-efficient than a Titan X GPU for
                 processing online individual requests in small batch
                 sizes. For processing static data in large batch sizes,
                 the proposed solution is on a par with a Titan X GPU in
                 terms of throughput while delivering $ 9.5 \times $
                 higher energy efficiency.},
  acknowledgement = ack-nhfb,
  bibdate =      {Thu Nov 1 16:44:40 MDT 2018},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
}

%%% JETC 14(2), July 2018, article 19.
@article{Potok:2018:SCD,
  author          = {Thomas E. Potok and Catherine Schuman and Steven Young
                     and Robert Patton and Federico Spedalieri and Jeremy Liu
                     and Ke-Thia Yao and Garrett Rose and Gangotree Chakma},
  title           = {A Study of Complex Deep Learning Networks on
                     High-Performance, Neuromorphic, and Quantum Computers},
  journal         = j-JETC,
  fjournal        = {ACM Journal on Emerging Technologies in Computing Systems
                     (JETC)},
  volume          = {14},
  number          = {2},
  articleno       = {19},
  pages           = {19:1--19:??},
  month           = jul,
  year            = {2018},
  doi             = {https://doi.org/10.1145/3178454},
  issn            = {1550-4832},
  coden           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  abstract        = {Current deep learning approaches have been very successful
                     using convolutional neural networks trained on large
                     graphical-processing-unit-based computers. Three
                     limitations of this approach are that (1) they are based on
                     a simple layered network topology, i.e., highly connected
                     layers, without intra-layer connections; (2) the networks
                     are manually configured to achieve optimal results, and (3)
                     the implementation of the network model is expensive in
                     both cost and power. In this article, we evaluate deep
                     learning models using three different computing
                     architectures to address these problems: quantum computing
                     to train complex topologies, high performance computing to
                     automatically determine network topology, and neuromorphic
                     computing for a low-power hardware implementation. We use
                     the MNIST dataset for our experiment, due to input size
                     limitations of current quantum computers. Our results show
                     the feasibility of using the three architectures in tandem
                     to address the above deep learning limitations. We show
                     that a quantum computer can find high quality values of
                     intra-layer connection weights in a tractable time as the
                     complexity of the network increases, a high performance
                     computer can find optimal layer-based topologies, and a
                     neuromorphic computer can represent the complex topology
                     and weights derived from the other architectures in low
                     power memristive hardware.},
  bibdate         = {Thu Nov 1 16:44:40 MDT 2018},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
}

%%% JETC 14(2), July 2018, article 20.  No abstract is recorded for this entry.
@article{Xu:2018:SPC,
  author          = {Jiang Xu and Yuichi Nakamura and Andrew Kahng},
  title           = {Silicon Photonics for Computing Systems},
  journal         = j-JETC,
  fjournal        = {ACM Journal on Emerging Technologies in Computing Systems
                     (JETC)},
  volume          = {14},
  number          = {2},
  articleno       = {20},
  pages           = {20:1--20:??},
  month           = jul,
  year            = {2018},
  doi             = {https://doi.org/10.1145/3208198},
  issn            = {1550-4832},
  coden           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  bibdate         = {Thu Nov 1 16:44:40 MDT 2018},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
}

%%% JETC 14(2), July 2018, article 21.  The mesh dimension in the
%%% abstract is written in TeX math mode ($ 8 \times 8 $) rather than
%%% as the plain-text letter "x".
@Article{Zhang:2018:LBT,
  author =       "Zhe Zhang and Yaoyao Ye",
  title =        "A Learning-Based Thermal-Sensitive Power Optimization
                 Approach for Optical {NoCs}",
  journal =      j-JETC,
  volume =       "14",
  number =       "2",
  pages =        "21:1--21:??",
  month =        jul,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3173468",
  ISSN =         "1550-4832",
  bibdate =      "Thu Nov 1 16:44:40 MDT 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Optical networks-on-chip (NoCs) based on silicon
                 photonics have been proposed as emerging on-chip
                 communication architectures for chip multiprocessors
                 with large core counts. However, due to the thermal
                 sensitivity of optical devices used in optical NoCs,
                 on-chip temperature variations cause significant
                 thermal-induced optical power loss, which would
                 counteract the power advantages of optical NoCs. To
                 tackle this problem, in this work, we propose a
                 learning-based thermal-sensitive power optimization
                 approach for mesh- or torus-based optical NoCs in the
                 presence of temperature variations. The key techniques
                 proposed include an initial device-setting and
                 thermal-tuning mechanism that is a device-level
                 optimization technique, and a learning-based
                 thermal-sensitive adaptive routing algorithm that is a
                 network-level optimization technique. Simulation
                 results of an $ 8 \times 8 $ mesh-based optical NoC
                 show that the proposed initial device-setting and
                 thermal-tuning mechanism confines the worst-case
                 thermal-induced optical energy consumption to be on
                 the order of tens of pJ/bit, by avoiding significant
                 thermal-induced optical power loss caused by
                 temperature-dependent wavelength shifts. Besides, it
                 shows that the learning-based thermal-sensitive
                 adaptive routing algorithm is able to find an optimal
                 path with the minimum estimated thermal-induced
                 optical power consumption for each communication
                 pair. The proposed routing has a greater space for
                 optimization, especially for applications with more
                 long-distance traffic.",
  acknowledgement = ack-nhfb,
  articleno =    "21",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}

%%% JETC 14(2), July 2018, article 22.
@article{Xu:2018:PVT,
  author          = {Yi Xu and Jun Yang and Rami Melhem},
  title           = {A Process-Variation-Tolerant Method for Nanophotonic
                     On-Chip Network},
  journal         = j-JETC,
  fjournal        = {ACM Journal on Emerging Technologies in Computing Systems
                     (JETC)},
  volume          = {14},
  number          = {2},
  articleno       = {22},
  pages           = {22:1--22:??},
  month           = jul,
  year            = {2018},
  doi             = {https://doi.org/10.1145/3208073},
  issn            = {1550-4832},
  coden           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  abstract        = {Nanophotonic networks, a potential candidate for future
                     networks on-chip, have been challenged for their
                     reliability due to several device-level limitations. One
                     of the main issues is that fabrication errors (a.k.a.
                     process variations) can cause devices to malfunction,
                     rendering communication unreliable. For example, the
                     microring resonator, a preferred optical modulator device,
                     may not resonate at the designated wavelength under process
                     variations (PVs), leading to communication errors and
                     bandwidth loss. This article proposes a series of solutions
                     to the wavelength drifting problem of microrings and
                     subsequent bandwidth loss problem of an optical network,
                     due to PVs. The objective is to maximize network bandwidth
                     through proper arrangement among microrings and wavelengths
                     with minimum power requirements. Our arrangement, called
                     ``MinTrim,'' solves this problem using simple integer
                     linear programming, adding supplementary microrings, and
                     allowing flexible assignment of wavelengths to network
                     nodes as long as the resulting network presents maximal
                     bandwidth. Each step is shown to improve bandwidth
                     provisioning with lower power requirements. Evaluations
                     on a sample network show that a baseline network could
                     lose more than 40\% bandwidth due to PVs. Such loss can be
                     recovered by MinTrim to produce a network with 98.4\%
                     working bandwidth. In addition, the power required for
                     arranging microrings is 39\% lower than the baseline.
                     Therefore, MinTrim provides an efficient PV-tolerant
                     solution to improving the reliability of on-chip photonics.},
  bibdate         = {Thu Nov 1 16:44:40 MDT 2018},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
}

%%% JETC 14(2), July 2018, article 23.
@article{Fusella:2018:RPC,
  author          = {Edoardo Fusella and Alessandro Cilardo},
  title           = {Reducing Power Consumption of Lasers in Photonic {NoCs}
                     through Application-Specific Mapping},
  journal         = j-JETC,
  fjournal        = {ACM Journal on Emerging Technologies in Computing Systems
                     (JETC)},
  volume          = {14},
  number          = {2},
  articleno       = {23},
  pages           = {23:1--23:??},
  month           = jul,
  year            = {2018},
  doi             = {https://doi.org/10.1145/3173463},
  issn            = {1550-4832},
  coden           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  abstract        = {To face the complex communication problems that arise as
                     the number of on-chip components grows up, photonic
                     networks-on-chip (NoCs) have been recently proposed to
                     replace electronic interconnects. However, photonic NoCs
                     lack efficient laser sources, possibly resulting in an
                     inefficient or inoperable architecture. In this article,
                     we introduce a methodology for the design space exploration
                     of optical NoC mapping solutions, which automatically
                     assigns IPs/cores to the network tiles such that the laser
                     power consumption is minimized. The experimental evaluation
                     shows average reductions of 34.7\% and 27.3\% in the power
                     consumption compared to, respectively,
                     application-oblivious and randomly mapped photonic NoCs,
                     allowing improved energy efficiency.},
  bibdate         = {Thu Nov 1 16:44:40 MDT 2018},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
}

%%% JETC 14(2), July 2018, article 24.  The braces around "Le Beux" keep
%%% the two-word surname together during BibTeX name parsing.
@article{Luo:2018:OOW,
  author          = {Jiating Luo and Cedric Killian and Sebastien {Le Beux}
                     and Daniel Chillet and Olivier Sentieys and Ian O'Connor},
  title           = {Offline Optimization of Wavelength Allocation and Laser
                     Power in Nanophotonic Interconnects},
  journal         = j-JETC,
  fjournal        = {ACM Journal on Emerging Technologies in Computing Systems
                     (JETC)},
  volume          = {14},
  number          = {2},
  articleno       = {24},
  pages           = {24:1--24:??},
  month           = jul,
  year            = {2018},
  doi             = {https://doi.org/10.1145/3178453},
  issn            = {1550-4832},
  coden           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  abstract        = {Optical Network-on-Chip (ONoC) is a promising communication
                     medium for large-scale multiprocessor systems-on-chips.
                     Indeed, ONoC can outperform classical electrical NoCs in
                     terms of energy efficiency and bandwidth density, in
                     particular, because this medium can support multiple
                     transactions at the same time on different wavelengths by
                     using Wavelength Division Multiplexing (WDM). However,
                     multiple signals sharing simultaneously the same part of a
                     waveguide can lead to inter-channel crosstalk noise. This
                     problem impacts the signal-to-noise ratio of the optical
                     signals, which leads to an increase in the Bit Error Rate
                     (BER) at the receiver side. If a specific BER is targeted,
                     an increase of laser power should be necessary to satisfy
                     the SNR. In this context, an important issue is to evaluate
                     the laser power needed to satisfy the various desired
                     communication bandwidths based on the BER performance
                     requirements. In this article, we propose an off-line
                     approach that concurrently optimizes the laser power
                     scaling and execution time of a global application. A set
                     of different levels of power is introduced for each laser,
                     to ensure that optical signals can be emitted with
                     just-enough power to ensure targeted BER. As a result, most
                     promising solutions are highlighted for mapping a defined
                     application onto a 16-core ring-based WDM ONoC.},
  bibdate         = {Thu Nov 1 16:44:40 MDT 2018},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
}

%%% JETC 14(2), July 2018, article 25.  The braces around "SHARP" protect
%%% the acronym from title recasing by bibliography styles.
@article{Vanwinkle:2018:SSH,
  author          = {Scott Vanwinkle and Avinash Karanth Kodi},
  title           = {{SHARP}: Shared Heterogeneous Architecture with
                     Reconfigurable Photonic Network-on-Chip},
  journal         = j-JETC,
  fjournal        = {ACM Journal on Emerging Technologies in Computing Systems
                     (JETC)},
  volume          = {14},
  number          = {2},
  articleno       = {25},
  pages           = {25:1--25:??},
  month           = jul,
  year            = {2018},
  doi             = {https://doi.org/10.1145/3185383},
  issn            = {1550-4832},
  coden           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  abstract        = {As the relentless quest for higher throughput and lower
                     energy cost continues in heterogeneous multicores, there
                     is a strong demand for energy-efficient and
                     high-performance Network-on-Chip (NoC) architectures.
                     Heterogeneous architectures that can simultaneously utilize
                     both the serialized nature of the CPU as well as the thread
                     level parallelism of the GPU are gaining traction in the
                     industry. A critical issue with heterogeneous architectures
                     is finding an optimal way to utilize the shared resources
                     such as the last level cache and NoC without hindering the
                     performance of either the CPU or the GPU core. Photonic
                     interconnects are a disruptive technology solution that
                     has the potential to increase the bandwidth, reduce
                     latency, and improve energy-efficiency over traditional
                     metallic interconnects. In this article, we propose a
                     CPU-GPU heterogeneous architecture called Shared
                     Heterogeneous Architecture with Reconfigurable Photonic
                     Network-on-Chip (SHARP) that clusters CPU and GPU cores
                     around the same router and dynamically allocates bandwidth
                     between the CPU and GPU cores based on application demands.
                     The SHARP architecture is designed as a Single-Writer
                     Multiple-Reader (SWMR) crossbar with reservation-assist to
                     connect CPU/GPU cores that dynamically reallocates
                     bandwidth using buffer utilization information at runtime.
                     As network traffic exhibits temporal and spatial
                     fluctuations due to application behavior, SHARP can
                     dynamically reallocate bandwidth and thereby adapt to
                     application demands. SHARP demonstrates 34\% performance
                     (throughput) improvement over a baseline electrical CMESH
                     while consuming 25\% less energy per bit. Simulation
                     results have also shown 6.9\% to 14.9\% performance
                     improvement over other flavors of the proposed SHARP
                     architecture without dynamic bandwidth allocation.},
  bibdate         = {Thu Nov 1 16:44:40 MDT 2018},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
}

@Article{Ishihara:2018:INP,
  author =       "Tohru Ishihara and Akihiko Shinya and