%%% Load bibnames.sty, then supply fallback definitions of \circled,
%%% \pkg, and \reg, each only if that macro is not already defined.
@Preamble{
  "\input bibnames.sty" #
  "\ifx \undefined \circled \def \circled #1{(#1)} \fi" #
  "\ifx \undefined \pkg \def \pkg #1{{{\tt #1}}} \fi" #
  "\ifx \undefined \reg \def \reg {\circled{R}} \fi"
}
%%% Contact details referenced by the acknowledgement field of the entries.
@String{ack-nhfb = "Nelson H. F. Beebe,
                    University of Utah,
                    Department of Mathematics, 110 LCB,
                    155 S 1400 E RM 233,
                    Salt Lake City, UT 84112-0090, USA,
                    Tel: +1 801 581 5254,
                    FAX: +1 801 581 4148,
                    e-mail: \path|beebe@math.utah.edu|,
                            \path|beebe@acm.org|,
                            \path|beebe@computer.org| (Internet),
                    URL: \path|https://www.math.utah.edu/~beebe/|"}
%%% Journal-name abbreviation referenced by the journal field of the entries.
@String{j-JETC = "ACM Journal on Emerging Technologies
                  in Computing Systems (JETC)"}
@Article{Irwin:2005:E,
  author =       "Mary Jane Irwin and Vijaykrishnan Narayanan",
  title =        "Editorial",
  journal =      j-JETC,
  volume =       "1",
  number =       "1",
  pages =        "1--6",
  month =        apr,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Sep 17 15:29:54 MDT 2005",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Narendra:2005:CDC,
  author =       "Siva G. Narendra",
  title =        "Challenges and design choices in nanoscale {CMOS}",
  journal =      j-JETC,
  volume =       "1",
  number =       "1",
  pages =        "7--49",
  month =        apr,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Sep 17 15:29:54 MDT 2005",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Lim:2005:PPB,
  author =       "Sung Kyu Lim and Ramprasad Ravichandran and Mike
                 Niemier",
  title =        "Partitioning and placement for buildable {QCA}
                 circuits",
  journal =      j-JETC,
  volume =       "1",
  number =       "1",
  pages =        "50--72",
  month =        apr,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Sep 17 15:29:54 MDT 2005",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Gojman:2005:EDS,
  author =       "Benjamin Gojman and Eric Rachlin and John E. Savage",
  title =        "Evaluation of design strategies for stochastically
                 assembled nanoarray memories",
  journal =      j-JETC,
  volume =       "1",
  number =       "2",
  pages =        "73--108",
  month =        jul,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Sep 17 15:29:54 MDT 2005",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Dehon:2005:NBP,
  author =       "Andr{\'e} DeHon",
  title =        "Nanowire-based programmable architectures",
  journal =      j-JETC,
  volume =       "1",
  number =       "2",
  pages =        "109--162",
  month =        jul,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Sep 17 15:29:54 MDT 2005",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Huang:2005:TBQ,
  author =       "J. Huang and M. Momenzadeh and L. Schiano and M.
                 Ottavi and F. Lombardi",
  title =        "Tile-based {QCA} design using majority-like logic
                 primitives",
  journal =      j-JETC,
  volume =       "1",
  number =       "3",
  pages =        "163--185",
  month =        oct,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Mar 7 16:16:02 MST 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Chakrabarty:2005:DAM,
  author =       "Krishnendu Chakrabarty and Jun Zeng",
  title =        "Design automation for microfluidics-based biochips",
  journal =      j-JETC,
  volume =       "1",
  number =       "3",
  pages =        "186--223",
  month =        oct,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Mar 7 16:16:02 MST 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Patwardhan:2006:NNS,
  author =       "Jaidev P. Patwardhan and Chris Dwyer and Alvin R.
                 Lebeck and Daniel J. Sorin",
  title =        "{NANA}: a nano-scale active network architecture",
  journal =      j-JETC,
  volume =       "2",
  number =       "1",
  pages =        "1--30",
  month =        jan,
  year =         "2006",
  CODEN =        "????",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Mon Aug 28 07:08:02 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{VanMeter:2006:AIQ,
  author =       "Rodney {Van Meter} and Mark Oskin",
  title =        "Architectural implications of quantum computing
                 technologies",
  journal =      j-JETC,
  volume =       "2",
  number =       "1",
  pages =        "31--63",
  month =        jan,
  year =         "2006",
  CODEN =        "????",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Mon Aug 28 07:08:02 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Xie:2006:DSE,
  author =       "Yuan Xie and Gabriel H. Loh and Bryan Black and Kerry
                 Bernstein",
  title =        "Design space exploration for {$3$D} architectures",
  journal =      j-JETC,
  volume =       "2",
  number =       "2",
  pages =        "65--103",
  month =        apr,
  year =         "2006",
  CODEN =        "????",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Mon Aug 28 07:08:02 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Su:2006:YER,
  author =       "Fei Su and Krishnendu Chakrabarty",
  title =        "Yield enhancement of reconfigurable
                 microfluidics-based biochips using interstitial
                 redundancy",
  journal =      j-JETC,
  volume =       "2",
  number =       "2",
  pages =        "104--128",
  month =        apr,
  year =         "2006",
  CODEN =        "????",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Mon Aug 28 07:08:02 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Savage:2006:RAN,
  author =       "John E. Savage and Eric Rachlin and Andr{\'e} DeHon
                 and Charles M. Lieber and Yue Wu",
  title =        "Radial addressing of nanowires",
  journal =      j-JETC,
  volume =       "2",
  number =       "2",
  pages =        "129--154",
  month =        apr,
  year =         "2006",
  CODEN =        "????",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Mon Aug 28 07:08:02 MDT 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Massoud:2006:MDC,
  author =       "Yehia Massoud and Arthur Nieuwoudt",
  title =        "Modeling and design challenges and solutions for
                 carbon nanotube-based interconnect in future high
                 performance integrated circuits",
  journal =      j-JETC,
  volume =       "2",
  number =       "3",
  pages =        "155--196",
  month =        jul,
  year =         "2006",
  CODEN =        "????",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Thu Nov 16 18:25:43 MST 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Tahoori:2006:AID,
  author =       "Mehdi B. Tahoori",
  title =        "Application-independent defect tolerance of
                 reconfigurable nanoarchitectures",
  journal =      j-JETC,
  volume =       "2",
  number =       "3",
  pages =        "197--218",
  month =        jul,
  year =         "2006",
  CODEN =        "????",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Thu Nov 16 18:25:43 MST 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Datta:2006:ADF,
  author =       "Kushal Datta and Arindam Mukherjee and Arun
                 Ravindran",
  title =        "Automated design flow for diode-based nanofabrics",
  journal =      j-JETC,
  volume =       "2",
  number =       "3",
  pages =        "219--241",
  month =        jul,
  year =         "2006",
  CODEN =        "????",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Thu Nov 16 18:25:43 MST 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Ottavi:2006:HHE,
  author =       "Marco Ottavi and Luca Schiano and Fabrizio Lombardi
                 and Douglas Tougaw",
  title =        "{HDLQ}: {A HDL} environment for {QCA} design",
  journal =      j-JETC,
  volume =       "2",
  number =       "4",
  pages =        "243--261",
  month =        oct,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1216396.1216397",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Fri Jun 20 11:03:17 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Emerging technologies have attracted a substantial
                 interest in overcoming the physical limitations of CMOS
                 as projected at the end of the Technology Roadmap;
                 among these technologies, quantum-dot cellular automata
                 (QCA) relies on different and novel paradigms to
                 implement dense, low power circuits and systems for
                 high-performance computing. As applicable to existing
                 technologies, a hierarchical process can be utilized to
                 facilitate the design of QCA circuits. Tools and
                 methodologies both at system and physical levels are
                 required to support all design phases. This article
                 presents an HDL model to describe QCA ``devices'' (also
                 referred elsewhere in the technical literature as
                 building blocks, i.e., majority voter, inverter, wire,
                 crossover) and facilitate the evaluation of their
                 design. This tool, referred to as HDLQ, allows a
                 designer to verify the logic characteristics of a QCA
                 system, while supporting within a design environment
                 different operational mechanisms (such as fault
                 injection) and the unique features of QCA (such as
                 bidirectionality and timing/clocking partitioning). The
                 applicability of this design environment to various
                 memory circuits for logic and timing verification is
                 presented in detail. Various defective conditions for
                 kinks due to thermodynamic effects and permanent faults
                 due to manufacturing defects are considered for
                 injection.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "CAD; fault injection; HDL; QCA",
}
@Article{Davids:2006:MFD,
  author =       "Daniel Davids and Siddhartha Datta and Arindam
                 Mukherjee and Bharat Joshi and Arun Ravindran",
  title =        "Multiple fault diagnosis in digital microfluidic
                 biochips",
  journal =      j-JETC,
  volume =       "2",
  number =       "4",
  pages =        "262--276",
  month =        oct,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1216396.1216398",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Fri Jun 20 11:03:17 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Microfluidics-based biochips consist of microfluidic
                 arrays on rigid substrates through which, movement of
                 fluids is tightly controlled to facilitate biological
                 reactions. Biochips are soon expected to revolutionize
                 biosensing, clinical diagnostics, and drug discovery.
                 Critical to the deployment of biochips in such diverse
                 areas is the dependability of these systems. Thus,
                 robust testing techniques are required to ensure an
                 adequate level of system dependability. Due to the
                 underlying mixed technology and energy domains, such
                 biochips exhibit unique failure mechanisms and defects.
                 In this article we present a highly effective fault
                 diagnosis strategy that uses a single source and sink
                 to detect and locate multiple faults in a microfluidic
                 array, without flooding the array, a problem that has
                 hampered realistic implementations of all existing
                 strategies. The strategy renders itself well for a
                 built-in self-test that could drastically reduce the
                 operating cost of microfluidic biochips. It can be used
                 during both the manufacturing phase of the biochip, as
                 well as field operation. Furthermore, the algorithm can
                 pinpoint the actual fault, as opposed to merely the
                 faulty regions that are typically identified by
                 strategies proposed in the literature. Also, analytical
                 results suggest that it is an effective strategy that
                 can be used to design highly dependable biochip
                 systems.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "droplet flooding; faults tolerance; Microfluidic
                 biochip; multiple fault; testing",
}
@Article{Prasad:2006:DSA,
  author =       "Aditya K. Prasad and Vivek V. Shende and Igor L.
                 Markov and John P. Hayes and Ketan N. Patel",
  title =        "Data structures and algorithms for simplifying
                 reversible circuits",
  journal =      j-JETC,
  volume =       "2",
  number =       "4",
  pages =        "277--293",
  month =        oct,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1216396.1216399",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Fri Jun 20 11:03:17 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Reversible logic is motivated by low-power design,
                 quantum circuits, and nanotechnology. We develop a
                 compact representation of small reversible circuits to
                 generate and store optimal circuits for all 40,320
                 three-input reversible functions, and millions of
                 four-input circuits. This allows implementing a
                 function optimally in constant time for use in the
                 peephole optimization of larger circuits produced by
                 existing techniques, and guarantees that every
                 three-bit subcircuit is optimal. To generate
                 subcircuits, we use a graph-based data structure and
                 algorithms for circuit restructuring. Finally, we
                 demonstrate a suboptimal circuit for which peephole
                 optimization fails.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "circuit libraries; Circuit simplification; optimal
                 subcircuit",
}
@Article{Zhao:2007:PTM,
  author =       "Wei Zhao and Yu Cao",
  title =        "Predictive technology model for nano-{CMOS} design
                 exploration",
  journal =      j-JETC,
  volume =       "3",
  number =       "1",
  pages =        "1:1--1:??",
  month =        apr,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1229175.1229176",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Fri Jun 20 11:03:25 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "A predictive MOSFET model is critical for early
                 circuit design research. In this work, a new generation
                 of Predictive Technology Model (PTM) is developed,
                 covering emerging physical effects and alternative
                 structures, such as the double-gate device (i.e.,
                 FinFET). Based on physical models and early stage
                 silicon data, PTM of bulk and double-gate devices are
                 successfully generated from 130nm to 32nm technology
                 nodes, with effective channel length down to 13nm. By
                 tuning only ten primary parameters, PTM can be easily
                 customized to cover a wide range of process
                 uncertainties. The accuracy of PTM predictions is
                 comprehensively verified with published silicon data:
                 the error of the current is below 10\% for both NMOS
                 and PMOS. Furthermore, the new PTM correctly captures
                 process sensitivities in the nanometer regime. PTM is
                 available online at
                 \path|http://www.eas.asu.edu/~ptm|.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "early design exploration; FinFET; predictive modeling;
                 process variations; Technology scaling",
}
@Article{Schulhof:2007:SRC,
  author =       "Gabriel Schulhof and Konrad Walus and Graham A.
                 Jullien",
  title =        "Simulation of random cell displacements in {QCA}",
  journal =      j-JETC,
  volume =       "3",
  number =       "1",
  pages =        "2:1--2:??",
  month =        apr,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1229175.1229177",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Fri Jun 20 11:03:25 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "We analyze the behavior of quantum-dot cellular
                 automata (QCA) building blocks in the presence of
                 random cell displacements. The QCA cells are modeled
                 using the coherence vector description and simulated
                 using QCADesigner. We evaluate various fundamental
                 circuits: the wire, the inverter, the majority gate,
                 and the two-wire crossing approaches: the coplanar
                 crossover and the multilayer crossover. Our results
                 show that different building blocks have different
                 displacement tolerances. The coplanar crossover and
                 inverter perform the weakest. The wire is the most
                 robust. We have found displacement tolerances to be a
                 function of circuit layout and geometry rather than
                 cell size.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "fabrication variances; fault tolerance; QCA;
                 Quantum-dot cellular automata",
}
@Article{Rose:2007:DCM,
  author =       "Garrett S. Rose and Yuxing Yao and James M. Tour and
                 Adam C. Cabe and Nadine Gergel-Hackett and Nabanita
                 Majumdar and John C. Bean and Lloyd R. Harriott and
                 Mircea R. Stan",
  title =        "Designing {CMOS}\slash molecular memories while
                 considering device parameter variations",
  journal =      j-JETC,
  volume =       "3",
  number =       "1",
  pages =        "3:1--3:??",
  month =        apr,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1229175.1229178",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Fri Jun 20 11:03:25 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "In recent years, many advances have been made in the
                 development of molecular scale devices. Experimental
                 data shows that these devices have potential for use in
                 both memory and logic. This article describes the
                 challenges faced in building crossbar array-based
                 molecular memory and develops a methodology to optimize
                 molecular scale architectures based on experimental
                 device data taken at room temperature. In particular,
                 issues in reading and writing such as memory using CMOS
                 are discussed, and a solution is introduced for easily
                 reading device conductivity states (typically
                 characterized by very small currents). Additionally, a
                 metric is derived to determine the voltages for writing
                 to the crossbar array. The proposed memory design is
                 also simulated with consideration to device parameter
                 variations. Thus, the results presented here shed light
                 on important design choices to be made at multiple
                 abstraction levels, from devices to architectures.
                 Simulation results, incorporating experimental device
                 data, are presented using Cadence Spectre.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "CMOS; molecular electronics; nanotechnology",
}
@Article{McKee:2007:ESI,
  author =       "Sally A. McKee",
  title =        "Editorial to special issue on reliable computing",
  journal =      j-JETC,
  volume =       "3",
  number =       "2",
  pages =        "4:1--4:??",
  month =        jul,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1265949.1265950",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Fri Jun 20 11:03:32 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Eshaghian-Wilner:2007:SWN,
  author =       "Mary M. Eshaghian-Wilner and Alex Khitun and Shiva
                 Navab and Kang L. Wang",
  title =        "The spin-wave nanoscale reconfigurable mesh and the
                 labeling problem",
  journal =      j-JETC,
  volume =       "3",
  number =       "2",
  pages =        "5:1--5:??",
  month =        jul,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1265949.1265951",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Fri Jun 20 11:03:32 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "In this article, we present a nanoscale reconfigurable
                 mesh which is interconnected by ferromagnetic spin-wave
                 buses. In this architecture, unlike the traditional
                 spin-based nano structures which transmit charge, waves
                 are transmitted. As a result, the power consumption of
                 the proposed modules can be low. This reconfigurable
                 mesh, while requiring the same number of switches and
                 buses as the standard reconfigurable mesh, is capable
                 of simultaneously transmitting $N$ waves on each of the
                 spin-wave buses. Because of this highly parallel
                 feature, very fast and fault-tolerant algorithms can be
                 designed. To illustrate the superior performance of the
                 proposed spin-wave reconfigurable mesh, we present
                 three fast labeling algorithms.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "image processing; nanoscale architectures;
                 reconfigurable mesh; Spin waves",
}
@Article{Prodan:2007:DDE,
  author =       "Lucian Prodan and Mihai Udrescu and Oana Boncalo and
                 Mircea Vladutiu",
  title =        "Design for dependability in emerging technologies",
  journal =      j-JETC,
  volume =       "3",
  number =       "2",
  pages =        "6:1--6:??",
  month =        jul,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1265949.1265952",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Fri Jun 20 11:03:32 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "As current microelectronics will reach their physical
                 limits within the foreseeable future, emerging
                 technologies may offer a solution for maintaining the
                 trends to increase computing performance.
                 Biologically-inspired and quantum computing represent
                 two emerging technology vectors for novel computing
                 architectures within nanoelectronics. However,
                 potential benefits will come at the cost of increased
                 device sensitivity to the surrounding environment. This
                 article provides a dependability perspective over these
                 technologies from a designer's standpoint. Maintaining
                 or increasing the dependability of unconventional
                 computational processes is discussed in two different
                 contexts, a bio-inspired computing architecture (the
                 Embryonics project) and a quantum computational
                 architecture (the QUERIST project).",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "bio-inspired computing; bio-inspired digital design;
                 Dependability; Embryonics; emerging technologies;
                 evolvable hardware; fault-tolerance assessment; quantum
                 computing; reliability",
}
@Article{Tyrrell:2007:ED,
  author =       "Andy M. Tyrrell and Andrew J. Greensted",
  title =        "Evolving dependability",
  journal =      j-JETC,
  volume =       "3",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jul,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1265949.1265953",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Fri Jun 20 11:03:32 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Evolvable hardware offers much for the future of
                 complex systems design. Evolutionary techniques not
                 only have the potential for larger solution space
                 coverage, but when implemented on hardware, also allow
                 system designs to adapt to changes in the environment,
                 including failures in system components. This article
                 reviews a number of novel techniques, all based in the
                 field of bio-inspired systems, that provide varying
                 degrees of dependability over and above standard
                 designs. In particular, three different techniques are
                 considered: using FPGAs and ideas from developmental
                 biology to create designs that possess emergent
                 fault-tolerant properties, using FPGAs and continuous
                 evolution to circumvent faults as and when they occur,
                 and, finally, we consider a novel ASIC designed and
                 built with bio-inspired systems in mind.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "bio-inspired architectures; Evolutionary algorithms;
                 fault tolerance; RISA architecture",
}
@Article{Sekanina:2007:EFR,
  author =       "Luk{\'a}{\v{s}} Sekanina",
  title =        "Evolutionary functional recovery in virtual
                 reconfigurable circuits",
  journal =      j-JETC,
  volume =       "3",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jul,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1265949.1265954",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Fri Jun 20 11:03:32 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "A virtual reconfigurable circuit (VRC) is a
                 domain-specific reconfigurable device developed using
                 an ordinary FPGA in order to easily implement evolvable
                 hardware applications. While a fast partial runtime
                 reconfiguration and application-specific programmable
                 elements represent the main advantages of VRC, the main
                 disadvantage of the VRC is the area consumed. This
                 study describes experiments conducted to estimate how
                 the use of VRC influences the dependability of
                 FPGA-based evolvable systems. It is shown that these
                 systems are not as sensitive to faults as their
                 area-demanding implementations might suggest. An
                 evolutionary algorithm is utilized to design fault
                 tolerant circuits as well as to perform an automatic
                 functional recovery when faults are detected in the
                 configuration memory of the FPGA. All the experiments
                 are performed on models of reconfigurable devices.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "Dependability; evolutionary algorithms; evolvable
                 hardware; FPGA",
}
@Article{Tempesti:2007:SRH,
  author =       "Gianluca Tempesti and Daniel Mange and
                 Pierre-Andr{\'e} Mudry and Jo{\"e}l Rossier and
                 Andr{\'e} Stauffer",
  title =        "Self-replicating hardware for reliability: {The
                 Embryonics Project}",
  journal =      j-JETC,
  volume =       "3",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jul,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1265949.1265955",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Fri Jun 20 11:03:32 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "The multicellular structure of biological organisms
                 and the interpretation in each of their cells of a
                 chemical program (the DNA string or genome) is the
                 source of inspiration for the Embryonics (embryonic
                 electronics) project, whose final objective is the
                 design of highly robust integrated circuits, endowed
                 with properties usually associated with the living
                 world: self-repair and self-replication. In this
                 article, we provide an overview of our latest research
                 in the domain of the self-replication of processing
                 elements within a programmable logic substrate, a key
                 prerequisite for achieving system-level fault tolerance
                 in our bio-inspired approach.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "Bio-inspired architectures; embryonic electronics;
                 growth; hierarchical fault tolerance; self-repair;
                 self-replication",
}
@Article{Patwardhan:2007:SOD,
  author =       "Jaidev Patwardhan and Chris Dwyer and Alvin R.
                 Lebeck",
  title =        "A self-organizing defect tolerant {SIMD}
                 architecture",
  journal =      j-JETC,
  volume =       "3",
  number =       "2",
  pages =        "10:1--10:??",
  month =        jul,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1265949.1265956",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Fri Jun 20 11:03:32 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "The continual decrease in transistor size (through
                 either scaled CMOS or emerging nanotechnologies)
                 promises to usher in an era of tera to peta-scale
                 integration but with increasing defects. Regardless of
                 fabrication methodology (top-down or bottom-up),
                 defect-tolerant architectures are necessary to exploit
                 the full potential of future increased device
                 densities.\par
                 This article explores a defect-tolerant SIMD
                 architecture (SOSA) that self-organizes a large number
                 of limited capability nodes with high defect rates into
                 SIMD processing elements. Simulation results show that
                 SOSA matches or exceeds the performance of conventional
                 systems for moderate to large problems, but with lower
                 power density.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
  keywords =     "bit-serial; data parallel; defect tolerance; DNA;
                 nanocomputing; Self-organizing; SIMD",
}
@Article{Chakrabarty:2007:ESI,
  author =       "Krishnendu Chakrabarty and Sachin Sapatnekar",
  title =        "Editorial to special issue {DAC 2006}",
  journal =      j-JETC,
  volume =       "3",
  number =       "3",
  pages =        "11:1--11:??",
  month =        nov,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1295231.1295232",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Fri Jun 20 11:03:49 MDT 2008",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@article{Paul:2007:PBC,
  author = {Bipul C. Paul and Shinobu Fujita and Masaki Okajima
    and Thomas Lee},
  title = {Prospect of ballistic {CNFET} in high performance
    applications: {Modeling} and analysis},
  journal = j-JETC,
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  volume = 3,
  number = 3,
  articleno = 12,
  pages = {12:1--12:??},
  month = nov,
  year = 2007,
  doi = {https://doi.org/10.1145/1295231.1295233},
  issn = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l = {1550-4832},
  coden = {????},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  keywords = {Ballistic carbon nanotube FET (CNFET); circuit
    compatible model; circuit performance; parasitic
    capacitance; process variability},
  abstract = {With the advent of carbon nanotube technology,
    evaluating circuit and system performance using these
    devices is becoming extremely important. In this
    article, we present a quasi-analytical device model for
    intrinsic ballistic CNFET, which can be used in any
    conventional circuit simulator like SPICE. This simple
    quasi-analytical model is effective in a wide variety
    of CNFET structures as well as for a wide range of
    operating conditions in the digital circuit application
    domain. We also provide insight into how the parasitic
    fringe capacitance in state-of-the-art CNFET geometries
    impacts the overall performance of CNFET circuits. We
    show that unless the device width can be significantly
    reduced, the effective gate capacitance of CNFET will
    be strongly dominated by the parasitic fringe
    capacitances, and the superior performance of intrinsic
    CNFET over silicon MOSFET cannot be achieved in
    circuit. We further show that unlike conventional
    MOSFET, nanotube FETs are significantly less sensitive
    to many process parameter variations due to their
    inherent device structures and cylindrical gate
    geometry.},
  bibdate = {Fri Jun 20 11:03:49 MDT 2008},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
}
@article{Yuh:2007:PDT,
  author = {Ping-Hung Yuh and Chia-Lin Yang and Yao-Wen Chang},
  title = {Placement of defect-tolerant digital microfluidic
    biochips using the {$T$}-tree formulation},
  journal = j-JETC,
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  volume = 3,
  number = 3,
  articleno = 13,
  pages = {13:1--13:??},
  month = nov,
  year = 2007,
  doi = {https://doi.org/10.1145/1295231.1295234},
  issn = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l = {1550-4832},
  coden = {????},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  keywords = {biochip; Microfluidics; placement},
  abstract = {Droplet-based microfluidic biochips have recently
    gained much attention and are expected to revolutionize
    the biological laboratory procedures. As biochips are
    adopted for the complex procedures in molecular
    biology, its complexity is expected to increase due to
    the need of multiple and concurrent assays on a chip.
    In this article, we formulate the placement problem of
    digital microfluidic biochips with a tree-based
    topological representation, called $T$-tree. To the
    best knowledge of the authors, this is the first work
    that adopts a topological representation to solve the
    placement problem of digital microfluidic biochips. We
    also consider the defect tolerant issue to avoid to use
    defective cells due to fabrication. Experimental
    results demonstrate that our approach is more efficient
    and effective than the previous unified synthesis and
    placement framework.},
  bibdate = {Fri Jun 20 11:03:49 MDT 2008},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
}
@article{Xu:2007:ADP,
  author = {Tao Xu and William L. Hwang and Fei Su and Krishnendu
    Chakrabarty},
  title = {Automated design of pin-constrained digital
    microfluidic biochips under droplet-interference
    constraints},
  journal = j-JETC,
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  volume = 3,
  number = 3,
  articleno = 14,
  pages = {14:1--14:??},
  month = nov,
  year = 2007,
  doi = {https://doi.org/10.1145/1295231.1295235},
  issn = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l = {1550-4832},
  coden = {????},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  keywords = {biochips; droplets; microfluidics; Physical design
    automation},
  abstract = {Microfluidics-based biochips, also referred to as
    lab-on-a-chip, are devices that integrate
    fluid-handling functions such as sample preparation,
    analysis, separation, and detection. This emerging
    technology combines electronics with biology to open
    new application areas such as point-of-care diagnosis,
    on-chip DNA analysis, and automated drug discovery. We
    propose a design automation method for pin-constrained
    biochips that manipulate nanoliter volumes of discrete
    droplets on a microfluidic array. In contrast to the
    direct-addressing scheme that has been studied thus far
    in the literature, we assign a small number of
    independent control pins to a large number of
    electrodes in the biochip, thereby reducing design
    complexity and product cost. The design procedure
    relies on a droplet-trace-based array partitioning
    scheme and an efficient pin assignment technique,
    referred to as the ``Connect-5 algorithm.'' The
    proposed method is evaluated using a set of multiplexed
    bioassays.},
  bibdate = {Fri Jun 20 11:03:49 MDT 2008},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
}
@Article{Rad:2007:EAP,
author = "Reza M. P. Rad and Mohammad Tehranipoor",
title = "Evaluating area and performance of hybrid {FPGAs} with
nanoscale clusters and {CMOS} routing",
journal = j-JETC,
volume = "3",
number = "3",
pages = "15:1--15:??",
month = nov,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1295231.1295236",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Fri Jun 20 11:03:49 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Advances in fabrication technology of nanoscale
devices such as nanowires, carbon nanotubes and
molecular switches provide new opportunities for
implementing cluster-based FPGAs. Extensive research is
needed to evaluate area and performance of FPGAs made
from these devices and compare with their CMOS
counterparts. In this work, we propose a hybrid FPGA
that uses nanoscale clusters with a functionality
similar to the clusters of traditional CMOS FPGAs. The
proposed cluster is constructed by a crossbar of
nanowires and can be configured to implement the
required LUTs and intracluster MUXes. A CMOS interface
is also proposed to provide configuration and memory
elements for the nanoscale cluster. In the proposed
architecture, inter-cluster routing remains at CMOS
scale. We have developed models for area and delay of
clusters and interconnects of the proposed hybrid FPGA.
FPGA tools are configured with these models and used to
synthesize and configure the benchmark circuits onto
the hybrid FPGAs with NiSi nanowires or nanotubes.
Experiments are conducted to evaluate and compare area
and performance of the hybrid FPGA and traditional CMOS
FPGA (scaled to 22nm). Up to 82\% area reduction was
obtained from implementing MCNC benchmarks on the
hybrid FPGA. Performance of the hybrid FPGA is shown to
be close to that of CMOS FPGA.",
acknowledgement = ack-nhfb,
articleno = "15",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "CMOS; FPGA; Nanotechnology; performance; reliability",
}
@article{Su:2008:HLS,
  author = {Fei Su and Krishnendu Chakrabarty},
  title = {High-level synthesis of digital microfluidic
    biochips},
  journal = j-JETC,
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  volume = 3,
  number = 4,
  articleno = 1,
  pages = {1:1--1:??},
  month = jan,
  year = 2008,
  doi = {https://doi.org/10.1145/1324177.1324178},
  issn = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l = {1550-4832},
  coden = {????},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  keywords = {biochips; High-level synthesis; microfluidics;
    scheduling; system-on-chip},
  abstract = {Microfluidic biochips offer a promising platform for
    massively parallel DNA analysis, automated drug
    discovery, and real-time biomolecular recognition.
    Current techniques for full-custom design of
    droplet-based ``digital'' biochips do not scale well
    for concurrent assays and for next-generation
    system-on-chip (SOC) designs that are expected to
    include microfluidic components. We propose a system
    design methodology that attempts to apply classical
    high-level synthesis techniques to the design of
    digital microfluidic biochips. We focus here on the
    problem of scheduling bioassay functions under resource
    constraints. We first develop an optimal scheduling
    strategy based on integer linear programming. However,
    because the scheduling problem is NP-complete, we also
    develop two heuristic techniques that scale well for
    large problem instances. A clinical diagnostic
    procedure, namely multiplexed in-vitro diagnostics on
    human physiological fluids, is first used to illustrate
    and evaluate the proposed method. Next, the synthesis
    approach is applied to a protein assay, which serves as
    a more complex bioassay application. The proposed
    synthesis approach is expected to reduce human effort
    and design cycle time, and it will facilitate the
    integration of microfluidic components with
    microelectronic components in next-generation SOCs.},
  bibdate = {Fri Jun 20 11:04:00 MDT 2008},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
}
@article{VanMeter:2008:ADM,
  author = {Rodney {Van Meter} and W. J. Munro and Kae Nemoto and
    Kohei M. Itoh},
  title = {Arithmetic on a distributed-memory quantum
    multicomputer},
  journal = j-JETC,
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  volume = 3,
  number = 4,
  articleno = 2,
  pages = {2:1--2:??},
  month = jan,
  year = 2008,
  doi = {https://doi.org/10.1145/1324177.1324179},
  issn = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l = {1550-4832},
  coden = {????},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  keywords = {quantum computer architecture; Quantum computing},
  abstract = {We evaluate the performance of quantum arithmetic
    algorithms run on a distributed quantum computer (a
    quantum multicomputer). We vary the node capacity and
    I/O capabilities, and the network topology. The
    tradeoff of choosing between gates executed remotely,
    through ``teleported gates'' on entangled pairs of
    qubits (telegate), versus exchanging the relevant
    qubits via quantum teleportation, then executing the
    algorithm using local gates (teledata), is examined. We
    show that the teledata approach performs better, and
    that carry-ripple adders perform well when the
    teleportation block is decomposed so that the key
    quantum operations can be parallelized. A node size of
    only a few logical qubits performs adequately provided
    that the nodes have two transceiver qubits. A linear
    network topology performs acceptably for a broad range
    of system sizes and performance parameters. We
    therefore recommend pursuing small, high-I/O bandwidth
    nodes and a simple network. Such a machine will run
    Shor's algorithm for factoring large numbers
    efficiently.},
  bibdate = {Fri Jun 20 11:04:00 MDT 2008},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
}
@article{Ma:2008:MCE,
  author = {Xiaojun Ma and Jing Huang and Fabrizio Lombardi},
  title = {A model for computing and energy dissipation of
    molecular {QCA} devices and circuits},
  journal = j-JETC,
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  volume = 3,
  number = 4,
  articleno = 3,
  pages = {3:1--3:??},
  month = jan,
  year = 2008,
  doi = {https://doi.org/10.1145/1324177.1324180},
  issn = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l = {1550-4832},
  coden = {????},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  keywords = {emerging technology; QCA; reversible computing;
    thermodynamic analysis},
  abstract = {Quantum-dot Cellular Automata is an emerging
    technology that offers significant improvements over
    CMOS. Recently QCA has been advocated as a technology
    for implementing reversible computing. However,
    existing tools for QCA design and evaluation have
    limited capabilities. This paper presents a new
    mechanical-based model for computing in QCA. By
    avoiding a full quantum-thermodynamical calculation, it
    offers a classical view of the principles of QCA
    operation and can be used in evaluating energy
    dissipation for reversible computing. The proposed
    model is mechanically based and is applicable to
    six-dot (neutrally charged) QCA cells for molecular
    implementation. The mechanical model consists of a
    sleeve of changing shape; four electrically charged
    balls are connected by a stick that rotates around an
    axle in the sleeve. The sleeve acts as a clocking unit,
    while the angular position of the stick within the
    changing shape of the sleeve, identifies the phase for
    quasi-adiabatic switching. A thermodynamic analysis of
    the proposed model is presented. The behaviors of
    various QCA basic devices and circuits are analyzed
    using the proposed model. It is shown that the proposed
    model is capable of evaluating the energy consumption
    for reversible computing at device and circuit levels
    for molecular QCA implementation. As applicable to QCA,
    two clocking schemes are also analyzed for energy
    dissipation and performance (in terms of number of
    clocking zones).},
  bibdate = {Fri Jun 20 11:04:00 MDT 2008},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
}
@Article{Chuang:2008:SRS,
author = "Min-Lun Chuang and Chun-Yao Wang",
title = "Synthesis of reversible sequential elements",
journal = j-JETC,
volume = "3",
number = "4",
pages = "4:1--4:??",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1324177.1324181",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Fri Jun 20 11:04:00 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "To construct a reversible sequential circuit,
reversible sequential elements are required. This work
presents novel designs of reversible sequential
elements such as the $D$ latch, $ J K$ latch, and $T$
latch. Based on these reversible latches, we construct
the designs of the corresponding flip-flops. Then we
further discuss the physical implementations of our
designs based on electron waveguide $Y$-branch switch
technology. Test costs, including test generation and
test application, of reversible sequential circuits
with these reversible flip-flops are also discussed.
Compared with previous work, the implementation cost of
our new designs, including the number of gates and the
number of garbage outputs, is significantly reduced.
The number of gates in our designs is 47.4\% of the
designs in previous work on average. The number of
garbage outputs in our designs is 25\% of the designs
in previous work on average.",
acknowledgement = ack-nhfb,
articleno = "4",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "Reversible logic; sequential circuits; sequential
elements",
}
@article{Metodi:2008:HLI,
  author = {Tzvetan S. Metodi and Darshan D. Thaker and Andrew W.
    Cross and Isaac L. Chuang and Frederic T. Chong},
  title = {High-level interconnect model for the quantum logic
    array architecture},
  journal = j-JETC,
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  volume = 4,
  number = 1,
  articleno = 1,
  pages = {1:1--1:??},
  month = mar,
  year = 2008,
  doi = {https://doi.org/10.1145/1330521.1330522},
  issn = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l = {1550-4832},
  coden = {????},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  keywords = {fault tolerance; large scale; QLA; quantum; Quantum
    computer architecture design; teleportation},
  abstract = {We summarize the main characteristics of the quantum
    logic array (QLA) architecture with a careful look at
    the key issues not described in the original conference
    publications: primarily, the teleportation-based
    logical interconnect. The design goal of the quantum
    logic array architecture is to illustrate a model for a
    large-scale quantum architecture that solves the
    primary challenges of system-level reliability and data
    distribution over large distances. The QLA's logical
    interconnect design, which employs the quantum repeater
    protocol, is in principle capable of supporting the
    communication requirements for applications as large as
    the factoring of a 2048-bit number using Shor's quantum
    factoring algorithm. Our physical-level assumptions and
    architectural component validations are based on the
    trapped ion technology for implementing quantum
    computing.},
  bibdate = {Fri Jun 20 11:04:09 MDT 2008},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
}
@article{Donald:2008:RLS,
  author = {James Donald and Niraj K. Jha},
  title = {Reversible logic synthesis with {Fredkin} and {Peres}
    gates},
  journal = j-JETC,
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  volume = 4,
  number = 1,
  articleno = 2,
  pages = {2:1--2:??},
  month = mar,
  year = 2008,
  doi = {https://doi.org/10.1145/1330521.1330523},
  issn = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l = {1550-4832},
  coden = {????},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  keywords = {Quantum computing; reversible logic},
  abstract = {Reversible logic has applications in low-power
    computing and quantum computing. Most reversible logic
    synthesis methods are tied to particular gate types,
    and cannot synthesize large functions. This article
    extends RMRLS, a reversible logic synthesis tool, to
    include additional gate types. While classic RMRLS can
    synthesize functions using NOT, CNOT, and $n$-bit
    Toffoli gates, our work details the inclusion of
    $n$-bit Fredkin and Peres gates. We find that these
    additional gates reduce the average gate count for
    three-variable functions from 6.10 to 4.56, and improve
    the synthesis results of many larger functions, both in
    terms of gate count and quantum cost.},
  bibdate = {Fri Jun 20 11:04:09 MDT 2008},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
}
@article{Guiducci:2008:HPP,
  author = {Carlotta Guiducci and Christine Nardini},
  title = {High parallelism, portability, and broad
    accessibility: {Technologies} for genomics},
  journal = j-JETC,
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  volume = 4,
  number = 1,
  articleno = 3,
  pages = {3:1--3:??},
  month = mar,
  year = 2008,
  doi = {https://doi.org/10.1145/1330521.1330524},
  issn = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l = {1550-4832},
  coden = {????},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  keywords = {biosensors; Genomics; microarrays; point-of-care
    diagnostics},
  abstract = {Biotechnology is an area of great innovations that
    promises to have deep impact on everyday life thanks to
    profound changes in biology, medicine, and health care.
    This article will span from the description of the
    biochemical principles of molecular biology to the
    definition of the physics that supports the technology
    and to the devices and algorithms necessary to observe
    molecular events in a controlled, portable, and highly
    parallel manner. Throughout this discussion, constant
    attention will be given to the ultimate goals and
    applications of these innovations as well as to the
    related issues.},
  bibdate = {Fri Jun 20 11:04:09 MDT 2008},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
}
@article{Narayanan:2008:E,
  author = {Vijaykrishnan Narayanan},
  title = {Editorial},
  journal = j-JETC,
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  volume = 4,
  number = 2,
  articleno = 4,
  pages = {4:1--4:??},
  month = apr,
  year = 2008,
  doi = {https://doi.org/10.1145/1350763.1350764},
  issn = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l = {1550-4832},
  coden = {????},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  bibdate = {Fri Jun 20 11:04:16 MDT 2008},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
}
@article{Bahar:2008:IJA,
  author = {R. Iris Bahar and Krishnendu Chakrabarty},
  title = {Introduction to joint {ACM JETC\slash TODAES} special
    issue on new, emerging, and specialized technologies},
  journal = j-JETC,
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  volume = 4,
  number = 2,
  articleno = 5,
  pages = {5:1--5:??},
  month = apr,
  year = 2008,
  doi = {https://doi.org/10.1145/1350763.1350765},
  issn = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l = {1550-4832},
  coden = {????},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  bibdate = {Fri Jun 20 11:04:16 MDT 2008},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
}
@article{Kuo:2008:MSA,
  author = {Shih-Hsien Kuo and Bruce Tidor and Jacob White},
  title = {A meshless, spectrally accurate, integral equation
    solver for molecular surface electrostatics},
  journal = j-JETC,
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  volume = 4,
  number = 2,
  articleno = 6,
  pages = {6:1--6:??},
  month = apr,
  year = 2008,
  doi = {https://doi.org/10.1145/1350763.1350766},
  issn = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l = {1550-4832},
  coden = {????},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  keywords = {integral equation; meshless; Poisson--Boltzmann
    equation; spectral method},
  abstract = {The need to determine electrostatic fields in domains
    bounded by molecular surfaces arises in a number of
    nanotechnology applications including: biomolecule
    design, carbon nanotube simulation, and molecular
    electron transport analysis. Molecular surfaces are
    typically smooth, without the corners common in
    electrical interconnect problems, but are often so
    geometrically complicated that numerical evaluation of
    the associated electrostatic fields is extremely
    time-consuming. In this paper we describe and
    demonstrate a meshless spectrally-accurate integral
    equation method that only requires a description of the
    molecular surface in the form of a collection of
    surface points. Our meshless method is a synthesis of
    techniques, suitably adapted, including: spherical
    harmonic surface interpolation, spectral-element-like
    integral equation discretization, integral
    desingularization via variable transformation, and
    matrix-implicit iterative matrix solution. The spectral
    accuracy of this combined method is verified using
    analytically solvable sphere and ellipsoid problems,
    and then its accuracy and efficiency is demonstrated
    numerically by solving capacitance and coupled
    Poisson\slash linearized Poisson--Boltzmann problems
    associated with a commonly used model of a molecule in
    solution. The results demonstrate that for a tolerance
    of 10$^{-3}$ this new approach reduces the number of
    unknowns by as much as two orders of magnitude over the
    more commonly used flat panel methods.},
  bibdate = {Fri Jun 20 11:04:16 MDT 2008},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
}
@Article{Deng:2008:CNT,
author = "Jie Deng and Albert Lin and Gordon C. Wan and H.-S.
Philip Wong",
title = "Carbon nanotube transistor compact model for circuit
design and performance optimization",
journal = j-JETC,
volume = "4",
number = "2",
pages = "7:1--7:??",
month = apr,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1350763.1350767",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Fri Jun 20 11:04:16 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "In this paper, we describe the development of the
Stanford University Carbon Nanotube FET (CNFET) Compact
Model. The CNFET Model is a circuit-compatible, compact
model which describes enhancement-mode, CMOS-like
CNFETs. It can be used to simulate both functionality
and performance of large-scale circuits with hundreds
of CNFETs. To produce realistic and relevant results,
the model accounts for several practical non-idealities
such as scattering in the near-ballistic channel,
effects of the source/drain extension region, and
charge-screening for multiple-nanotube CNFETs. The
model also includes a full transcapacitance network for
more accurate transient and AC results. The Stanford
University CNFET Model is implemented in both HSPICE
macro language and VerilogA. The VerilogA
implementation shows speedups of roughly $ 7 \times $
-- $ 15 \times $ over HSPICE. Applications of the model
suggest that $n$- and $p$-CNFETs will have $ 6 \times $
and $ 13 \times $ speed advantage over Si $n$- and
$p$-MOSFETs respectively at the 32nm node, and that a
CNT density of 250 CNTs/$ \mu $m is ideal for
multiple-nanotube gates. Such a compact CNFET model
will be absolutely essential in ushering in the Design
Era of CNFET circuits as carbon nanotube technology
outgrows its ``science discovery'' phase.",
acknowledgement = ack-nhfb,
articleno = "7",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "carbon nanotube FET; CNT; compact model; HSPICE;
VerilogA",
}
@article{Carmona:2008:FMA,
  author = {Josep Carmona and Jordi Cortadella and Yousuke Takada
    and Ferdinand Peper},
  title = {Formal methods for the analysis and synthesis of
    nanometer-scale cellular arrays},
  journal = j-JETC,
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  volume = 4,
  number = 2,
  articleno = 8,
  pages = {8:1--8:??},
  month = apr,
  year = 2008,
  doi = {https://doi.org/10.1145/1350763.1350768},
  issn = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l = {1550-4832},
  coden = {????},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  keywords = {cellular array; model checking; Nanocomputing;
    symbolic techniques},
  abstract = {Nanometer-scale structures suitable for computing have
    been investigated by several research groups in recent
    years. A common feature of these structures is their
    dynamic evolution through cascaded local interactions
    embedded on a discrete grid. Finding configurations
    capable of conducting computations is a task that often
    requires tedious experiments in laboratories. Formal
    methods --- though used extensively for the
    specification and verification of software and hardware
    computing systems --- are virtually unexplored with
    respect to computational structures at atomic scales.
    This paper presents a systematic approach toward the
    application of formal methods in this context, using
    techniques like abstraction, model-checking, and
    symbolic representations of states to explore and
    discover computational structures. The proposed
    techniques are applied to a system of CO molecules on a
    grid of Copper atoms, resulting in the design of a
    complete library of combinational logic gates based on
    this molecular system. The techniques are also applied
    on (more general) systems of cellular automata that
    employ an asynchronous mode of timing. The use of
    formal methods may narrow the gap between Physical
    Chemistry and Computer Science, allowing the
    description of interactions of nanometer scale systems
    on a level of abstraction suitable to devise computing
    devices.},
  bibdate = {Fri Jun 20 11:04:16 MDT 2008},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
}
@article{Crocker:2008:MQD,
  author = {Michael Crocker and Michael Niemier and X. Sharon Hu
    and Marya Lieberman},
  title = {Molecular {QCA} design with chemically reasonable
    constraints},
  journal = j-JETC,
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  volume = 4,
  number = 2,
  articleno = 9,
  pages = {9:1--9:??},
  month = apr,
  year = 2008,
  doi = {https://doi.org/10.1145/1350763.1350769},
  issn = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l = {1550-4832},
  coden = {????},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  keywords = {defects; Nanotechnology; physical simulation;
    quantum-dot cellular automata},
  abstract = {In this article we examine the impacts of the
    fundamental constraints required for circuits and
    systems made from molecular Quantum-dot Cellular
    Automata (QCA) devices. Our design constraints are
    ``chemically reasonable'' in that we consider the
    characteristics and dimensions of devices and
    scaffoldings that have actually been fabricated. This
    work is a necessary first step for any work in QCA CAD,
    and can also help shape experiments in the physical
    sciences for emerging, nano-scale devices. Our work
    shows that QCA circuits, scaffoldings, substrates, and
    devices should all be considered simultaneously.
    Otherwise, there is a very real possibility that the
    devices and scaffoldings that are eventually
    manufactured will result in devices that only work in
    isolation. ``Chemically reasonable'' also means that
    expected manufacturing defects must be considered. In
    our simulations we introduce defects associated with
    self-assembled systems into various designs to begin to
    define manufacturing tolerances. This work is
    especially timely as experimentalists are beginning to
    work on merging experimental tracks that address
    devices and scaffolds --- and the end result should
    facilitate correct logical operations.},
  bibdate = {Fri Jun 20 11:04:16 MDT 2008},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
}
@article{Lebeck:2008:IDS,
  author = {Alvin R. Lebeck and Krishnendu Chakrabarty},
  title = {Introduction to {DAC 2007} special section},
  journal = j-JETC,
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  volume = 4,
  number = 3,
  articleno = 10,
  pages = {10:1--10:??},
  month = aug,
  year = 2008,
  doi = {https://doi.org/10.1145/1389089.1389090},
  issn = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l = {1550-4832},
  coden = {????},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  bibdate = {Thu Sep 4 14:23:10 MDT 2008},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
}
@article{Xu:2008:IDR,
  author = {Tao Xu and Krishnendu Chakrabarty},
  title = {Integrated droplet routing and defect tolerance in the
    synthesis of digital microfluidic biochips},
  journal = j-JETC,
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  volume = 4,
  number = 3,
  articleno = 11,
  pages = {11:1--11:??},
  month = aug,
  year = 2008,
  doi = {https://doi.org/10.1145/1389089.1389091},
  issn = {1550-4832 (print), 1550-4840 (electronic)},
  issn-l = {1550-4832},
  coden = {????},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J967},
  keywords = {biochips; microfluidics; module placement; physical
    design automation},
  abstract = {Microfluidic biochips are revolutionizing
    high-throughput DNA sequencing, immunoassays, and
    clinical diagnostics. As high-throughput bioassays are
    mapped to digital microfluidic platforms, the need for
    design automation techniques is being increasingly
    felt. Moreover, as most applications of biochips are
    safety-critical in nature, defect tolerance is an
    essential system attribute. Several synthesis tools
    have recently been proposed for the automated design of
    biochips from the specifications of laboratory
    protocols. However, only a few of these tools address
    the problem of defect tolerance. In addition, most of
    these methods do not consider the problem of droplet
    routing in microfluidic arrays. These methods typically
    rely on postsynthesis droplet routing to implement
    biochemical protocols. Such an approach is not only
    time consuming, but also imposes an undue burden on the
    chip user. Postsynthesis droplet routing does not
    guarantee that feasible droplet pathways can be found
    for area-constrained biochip layouts; nonroutable
    fabricated biochips must be discarded. We present a
    synthesis tool that integrates defect tolerance and
    droplet routing in the design flow. Droplet
    routability, defined as the ease with which droplet
    pathways can be determined, is estimated and integrated
    in the synthesis procedure. Presynthesis and
    postsynthesis defect-tolerance methods are also
    presented. We use a large-scale protein assay as a case
    study to evaluate the proposed synthesis method.},
  bibdate = {Thu Sep 4 14:23:10 MDT 2008},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
}
@Article{Huang:2008:RAF,
author = "Tsung-Ching Huang and Kwang-Ting (Tim) Cheng and
Huai-Yuan Tseng and Chen-Pang Kung",
title = "Reliability analysis for flexible electronics: {Case}
study of integrated {a-Si:H} {TFT} scan driver",
journal = j-JETC,
volume = "4",
number = "3",
pages = "12:1--12:??",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1389089.1389092",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Sep 4 14:23:10 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Flexible electronics fabricated on thin-film,
lightweight, and bendable substrates (e.g., plastic)
have great potential for novel applications in consumer
electronics such as flexible displays, e-paper, and
smart labels; however, the key elements, namely
thin-film transistors (TFTs), for implementing flexible
circuits often suffer from electrical instability.
Therefore, thorough reliability analysis is critical
for flexible circuit design to ensure that the circuit
will operate reliably throughout its lifetime. In this
article we propose a methodology for reliability
simulation of hydrogenated amorphous silicon (a-Si:H)
TFT circuits. We show that: (1) the threshold voltage
({\em V$_{TH}$ \/}) shift of a single TFT can be
estimated by analyzing its operating conditions; and
(2) the circuit lifetime can be predicted accordingly
by using SPICE-like simulators with proper modeling. We
also propose an algorithm to reduce the simulation time
by orders of magnitude, with good prediction accuracy.
To validate our analytical model and simulation
methodology, we compare simulation results with the
actual circuit measurements of an integrated a-Si:H TFT
scan driver fabricated on a glass substrate and we
demonstrate very good consistency.",
acknowledgement = ack-nhfb,
articleno = "12",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "amorphous hydrogenated silicon (a-Si:H); flexible
electronics; reliability; scan driver; thin-film
transistor; threshold voltage",
}
@Article{Li:2008:ADP,
author = "Jing Li and Aditya Bansal and Swaroop Ghosh and Kaushik
Roy",
title = "An alternate design paradigm for low-power, low-cost,
testable hybrid systems using scaled {LTPS TFTs}",
journal = j-JETC,
volume = "4",
number = "3",
pages = "13:1--13:??",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1389089.1389093",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Sep 4 14:23:10 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "This article presents a holistic hybrid design
methodology for low-power, low-cost, testable digital
designs using low-temperature polycrystalline-silicon
thin-film transistors (LTPS TFTs). An alternate scaling
rule under low thermal budget (due to flexible
substrate) is developed to improve the performance of
TFTs in the presence of process variation. We
demonstrate that LTPS TFTs can be further optimized for
ultralow-power subthreshold operation with performances
comparable to contemporary single-crystal
silicon-on-insulator (c-Si SOI) devices after process
optimization. The optimized LTPS TFTs with high current
drivability and less variability can comprise a
promising low-cost option to augment Si CMOS
technology, opening up a plethora of new hybrid 3D
applications. We illustrate one such application: IC
testing. Testing of complex VLSI systems is a prime
concern due to design cost of DFT circuits, area/delay
overheads, and poor test confidence. To harness the
benefits of TFT technology, a novel low-power,
process-tolerant, generic, and reconfigurable test
structure designed using LTPS TFTs is proposed to
reduce the test cost, as well as to improve
diagnosability and verifiability, of complex VLSI
systems. Due to proper optimization of TFT devices, the
proposed test structure consumes low power but operates
with reasonable performance. Furthermore, the test
circuits do not consume any silicon area because they
can be integrated on-chip using 3D technology. Since
the test architecture is reconfigurable, this
eliminates the need to redesign built-in-self-test
(BIST) components that may vary from one processor
generation to another. We have developed test
structures using 200nm TFT devices and evaluated them
on designs implemented in 130nm bulk CMOS. For circuit
simulations, we have developed a SPICE-compatible model
for TFT devices. The BIST components designed using the
test structures operate at 0.8--4.3 GHz (compared to
8.2 GHz in bulk CMOS) with low power consumption. The
enhanced scan cells partially implemented in TFT (3D
hybrid design) consume $ \sim $ 24\% less power and $ \sim $
15--20\% less area of Si die compared to conventional
bulk-Si design (2D planar design), with minimal delay
overhead.",
acknowledgement = ack-nhfb,
articleno = "13",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "3D integration; BIST; DFT; generic; grain boundary
(GB); hybrid system; inherent variation;
low-temperature polycrystalline silicon (LTPS);
reconfigurable; thin-film transistor (TFT)",
}
@Article{Rad:2008:SNA,
author = "Reza Rad and Mohammad Tehranipoor",
title = "{SCT}: a novel approach for testing and configuring
nanoscale devices",
journal = j-JETC,
volume = "4",
number = "3",
pages = "14:1--14:??",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1389089.1389094",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Sep 4 14:23:10 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Novel strategies are necessary to efficiently test and
configure emerging reconfigurable nanoscale devices, in
addition to providing defect tolerance. This is mainly
due to the high defect densities that are expected for
these devices. Among different approaches,
reconfiguration-based defect avoidance has proven to be
a practical solution. However, configuration time, test
time, and defect-map size remain among the major
challenges for these new devices. In this article, we
propose a new approach (called SCT) that simultaneously
performs test and configuration. The proposed method
uses a built-in self-test (BIST) scheme for test and
defect tolerance. The method is based on testing
reconfigurable nanoblocks at the time of implementing a
function of a desired application on that block. The
SCT method considerably reduces the total test and
configuration time. It also eliminates the need for
storing the location of defects in a defect map on- or
off-chip. The presented probabilistic analysis results
show the effectiveness of this method in terms of test
and configuration time for architectures with rich
interconnect resources. Also, a Verilog simulation
model is developed for crossbar-based
nano-architectures. This model is used to implement
several MCNC benchmarks based on the proposed SCT
method. The simulation results demonstrate efficiency
of the method in terms of test time and yield under
different defect rates.",
acknowledgement = ack-nhfb,
articleno = "14",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "configuration and testing; crossbar; fault tolerance;
nanowire; reconfigurable nanoscale devices",
}
@Article{Xie:2008:ESI,
author = "Yuan Xie and Jason Cong and Paul Franzon",
title = "Editorial: {Special} issue on {$3$D} integrated
circuits and microarchitectures",
journal = j-JETC,
volume = "4",
number = "4",
pages = "15:1--15:??",
month = oct,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1412587.1412588",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Mar 17 14:22:55 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
acknowledgement = ack-nhfb,
articleno = "15",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Kgil:2008:PUS,
author = "Taeho Kgil and Ali Saidi and Nathan Binkert and Steve
Reinhardt and Krisztian Flautner and Trevor Mudge",
title = "{PicoServer}: {Using} {$3$D} stacking technology to
build energy efficient servers",
journal = j-JETC,
volume = "4",
number = "4",
pages = "16:1--16:??",
month = oct,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1412587.1412589",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Mar 17 14:22:55 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "This article extends our prior work to show that a
straightforward use of 3D stacking technology enables
the design of compact energy-efficient servers. Our
proposed architecture, called PicoServer, employs 3D
technology to bond one die containing several simple,
slow processing cores to multiple memory dies
sufficient for a primary memory. The multiple memory
dies are composed of DRAM. This use of 3D stacks
readily facilitates wide low-latency buses between
processors and memory. These remove the need for an L2
cache allowing its area to be re-allocated to
additional simple cores. The additional cores allow the
clock frequency to be lowered without impairing
throughput. Lower clock frequency means that thermal
constraints, a concern with 3D stacking, are easily
satisfied. We extend our original analysis on
PicoServer to include: (1) a wider set of server
workloads, (2) the impact of multithreading, and (3)
the on-chip DRAM architecture and system memory usage.
PicoServer is intentionally simple, requiring only the
simplest form of 3D technology where die are stacked on
top of one another. Our intent is to minimize risk of
introducing a new technology (3D) to implement a class
of low-cost, low-power compact server architectures.",
acknowledgement = ack-nhfb,
articleno = "16",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "3D stacking technology; chip multiprocessor;
full-system simulation; Low power; Tier-1/2/3 server",
}
@Article{Ma:2008:IEF,
author = "Yuchun Ma and Yongxiang Liu and Eren Kursun and Glenn
Reinman and Jason Cong",
title = "Investigating the effects of fine-grain
three-dimensional integration on microarchitecture
design",
journal = j-JETC,
volume = "4",
number = "4",
pages = "17:1--17:??",
month = oct,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1412587.1412590",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Mar 17 14:22:55 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "In this article we propose techniques that enable
efficient exploration of the 3D design space, where
each logical block can span more than one silicon
layer. Fine-grain 3D integration provides reduced
intrablock wire delay as well as improved power
consumption. However, the corresponding power and
performance advantage is usually underutilized, since
various implementations of multilayer blocks require
novel physical design and microarchitecture
infrastructure to explore 3D microarchitecture design
space. We develop a cubic packing engine which can
simultaneously optimize physical and architectural
design for efficient vertical integration. This
technique selects the individual unit designs from a
set of single-layer or multilayer implementations to
get the best microarchitectural design in terms of
performance, temperature, or both. Our experimental
results using a design driver of a high-performance
superscalar processor show a 36\% performance
improvement over traditional 2D for 2--4 layers and
14\% over 3D with single-layer unit implementations.
Since thermal characteristics of 3D integrated circuits
are among the main challenges, thermal-aware
floorplanning and thermal via insertion techniques are
employed to keep the peak temperatures below
threshold.",
acknowledgement = ack-nhfb,
articleno = "17",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "3D integration; 3D packing; microarchitecture;
thermal",
}
@Article{Zhan:2008:AMA,
author = "Yong Zhan and Sachin S. Sapatnekar",
title = "Automated module assignment in stacked-{Vdd} designs
for high-efficiency power delivery",
journal = j-JETC,
volume = "4",
number = "4",
pages = "18:1--18:??",
month = oct,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1412587.1412591",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Mar 17 14:22:55 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "With aggressive reductions in feature sizes and the
integration of multiple functionalities on the same
die, bottlenecks due to I/O pin limitations have become
a critical issue in today's VLSI designs, especially
for 3D IC technologies. To alleviate the pin limitation
problem, a stacked-Vdd circuit paradigm has recently
been proposed in the literature. However, for a circuit
designed using this paradigm, a significant amount of
power may be wasted if modules are not carefully
assigned to different Vdd domains. In this article, we
present a partition-based algorithm for efficiently
assigning modules at the floorplanning level, so as to
reuse currents between Vdd domains and minimize the
power wasted during the operation of the circuit.
Experimental results on both 3D and 2D ICs show that
compared with assigning modules to different Vdd
domains using enumeration and simulated annealing, our
algorithm can generate circuits with competitive power
and IR noise performance, while being orders of
magnitude faster.",
acknowledgement = ack-nhfb,
articleno = "18",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Ferri:2008:PYM,
author = "Cesare Ferri and Sherief Reda and R. Iris Bahar",
title = "Parametric yield management for {$3$D} {ICs}: {Models}
and strategies for improvement",
journal = j-JETC,
volume = "4",
number = "4",
pages = "19:1--19:??",
month = oct,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1412587.1412592",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Mar 17 14:22:55 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Three-Dimensional (3D) Integrated Circuits (ICs) that
integrate die with Through-Silicon Vias (TSVs) promise
to continue system and functionality scaling beyond the
traditional geometric 2D device scaling. 3D integration
also improves the performance of ICs by reducing the
communication time between different chip components
through the use of short TSV-based vertical wires. This
reduction is particularly attractive in processors
where it is desirable to reduce the access time between
the main logic die and the L2 cache or the main memory
die. Process variations in 2D ICs lead to a drop in
parametric yield (as measured by speed, leakage and
sales profits), which forces manufacturers to speed bin
their chips and to sell slow chips at reduced prices.
In this paper we develop a model to quantify the impact
of process variations on the parametric yield of 3D
ICs, and then we propose a number of integration
strategies that use a graph-theoretic framework to
maximize the performance, parametric yield and profits
of 3D ICs. Comparing our proposed strategies to current
yield-oblivious methods, it is demonstrated that it is
possible to increase the number of 3D ICs in the
fastest speed bins by almost $ 2 \times $, while
simultaneously reducing the number of slow ICs by
29.4\%. This leads to an improvement in performance by
up to 6.45\% and an increase of about 12.48\% in total
sales revenue using up-to-date market price models.",
acknowledgement = ack-nhfb,
articleno = "19",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "3D integration; leakage; performance; process
variations; yield management",
}
@Article{Miyakawa:2008:MST,
author = "Nobuaki Miyakawa and Eiri Hashimoto and Takanori
Maebashi and Natsuo Nakamura and Yutaka Sacho and
Shigeto Nakayama and Shinjiro Toyoda",
title = "Multilayer stacking technology using wafer-to-wafer
stacked method",
journal = j-JETC,
volume = "4",
number = "4",
pages = "20:1--20:??",
month = oct,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1412587.1412593",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Mar 17 14:22:55 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "We have developed a new three-dimensional stacking
technology using the wafer-to-wafer stacked method.
Electrical conductivity between each wafer is almost
100\% and contact resistance is less than 0.7 $ \Omega $
between a through-silicon via (TSV) and a microbump. We
have also created a prototype of a three-layer stacking
device using our technology, where each wafer for the
stacking is fabricated by using 0.18 $ \mu $m CMOS technology
based on 8-inch wafers. The device is operated by two
times the frequency of the multichip module (MCM)
device case using a two-dimensional device with
identical functions and minimally different power
consumption. The yields obtained from the results
comprising all functional tests are over 60\%.",
acknowledgement = ack-nhfb,
articleno = "20",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "3D integration; design; hardware; stacking process",
}
@Article{Shukla:2009:GEI,
author = "Sandeep Shukla",
title = "Guest editorial: {IEEE\slash ACM Symposium on
Nanoscale Architectures (NANOARCH07)}",
journal = j-JETC,
volume = "5",
number = "1",
pages = "1:1--1:??",
month = jan,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1482613.1482614",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Mar 17 14:23:14 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
acknowledgement = ack-nhfb,
articleno = "1",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Wang:2009:TAR,
author = "Shuo Wang and Lei Wang and Faquir Jain",
title = "Towards achieving reliable and high-performance
nanocomputing via dynamic redundancy allocation",
journal = j-JETC,
volume = "5",
number = "1",
pages = "2:1--2:??",
month = jan,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1482613.1482615",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Mar 17 14:23:14 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Nanoelectronic devices are considered to be the
computational fabrics for the emerging nanocomputing
systems due to their ultra-high speed and integration
density. However, the imperfect bottom-up self-assembly
fabrication leads to excessive defects that have become
a barrier for achieving reliable computing. In
addition, transient errors continue to be a problem.
The massive parallelism rendered by nanoscale
integration opens up new opportunities but also poses
challenges on how to manage such massive resources for
reliable and high-performance computing. In this paper,
we propose a nanoarchitecture solution to address these
emerging challenges. By using dynamic redundancy
allocation, the massive parallelism is exploited to
jointly achieve fault (defect/error) tolerance and high
performance. Simulation results demonstrate the
effectiveness of the proposed technique under a range
of fault rates and operating conditions.",
acknowledgement = ack-nhfb,
articleno = "2",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "hardware reliability; Nanoscale architecture;
performance; redundancy allocation; redundant design",
}
@Article{Wang:2009:ENP,
author = "Z. F. Wang and Huaixiu Zheng and Q. W. Shi and Jie
Chen",
title = "Emerging nanodevice paradigm: {Graphene-based}
electronics for nanoscale computing",
journal = j-JETC,
volume = "5",
number = "1",
pages = "3:1--3:??",
month = jan,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1482613.1482616",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Mar 17 14:23:14 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "The continued miniaturization of silicon-based
electronic circuits is fast approaching its physical
limitations. It is unlikely that advances in
miniaturization, following the so-called Moore's Law,
can continue in the foreseeable future. Nanoelectronics
has to go beyond silicon technology. New device
paradigms based on nanoscale materials, such as
molecular electronic devices, spin devices and
carbon-based devices, will emerge. In this article, we
introduce a nanodevice paradigm: graphene
nanoelectronics. Due to its unique quantum effects and
electronic properties, researchers predict that
graphene-based devices may replace carbon nanotube
devices and become major building blocks for future
nanoscale computing. To manifest its unique electronic
properties, we present some of our recent designs,
namely a graphene-based switch, a negative differential
resistance (NDR) device and a random access memory
array (RAM). Since these basic devices are the building
blocks for large-scale circuits, our findings can help
researchers construct useful computing systems and
study graphene-based circuit performance in the
future.",
acknowledgement = ack-nhfb,
articleno = "3",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "Graphene device; memory structure; negative
differential resistance; tight-binding model",
}
@Article{Taskin:2009:SRB,
author = "Baris Taskin and Andy Chiu and Jonathan Salkind and
Daniel Venutolo",
title = "A shift-register-based {QCA} memory architecture",
journal = j-JETC,
volume = "5",
number = "1",
pages = "4:1--4:??",
month = jan,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1482613.1482617",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Mar 17 14:23:14 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "A quantum-dot cellular automata (QCA) design of an $ n
\times m$-bit, shift-register-based memory architecture
is presented. The architecture maintains data at a
stable conformation, which is contrary to traditional
data in-motion concept for QCA architectures. The
memory architecture is based on an existing
dual-phase-synchronized, line-based, one-bit QCA memory
cell building block that provides size and latency
improvements over other known one-bit memory cells
through its novel clocking scheme. Read/write latencies
up to $ \sim $ 2X lower than the existing tile-based
architecture with three-phase, line-based memory cells
are obtained. Simulations with QCADesigner and HDLQ are
performed on a sample $ 4 \times 8$ bit memory
architecture implementation.",
acknowledgement = ack-nhfb,
articleno = "4",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "clocking; memory design; Quantum-dot cellular
automata",
}
@Article{Huo:2009:SBN,
author = "Dennis Huo and Qiaoyan Yu and David Wolpert and Paul
Ampadu",
title = "A simulator for ballistic nanostructures in a {$2$-D}
electron gas",
journal = j-JETC,
volume = "5",
number = "1",
pages = "5:1--5:??",
month = jan,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1482613.1482618",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Mar 17 14:23:14 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "A multipurpose simulator for ballistic nanostructures,
based on classical mechanics of electrons at the Fermi
level, has been successfully implemented. Despite the
simplicity of the model, the simulator successfully
reproduces a number of experimental results, and is
shown to consistently match observed current-voltage
characteristics and magnetoresistance phenomena. The
simulator results provide design guidelines for devices
which operate on ballistic transport principles. Using
the simulator, preliminary logic structures have been
designed based on the ballistic deflection
transistor.",
acknowledgement = ack-nhfb,
articleno = "5",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "2DEG; Ballistic transport; nanoelectronic device;
transistor",
}
@Article{Bahar:2009:ISS,
author = "R. Iris Bahar",
title = "Introduction to special section: {Best} of {NANOARCH
2008}",
journal = j-JETC,
volume = "5",
number = "2",
pages = "6:1--6:??",
month = jul,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1543438.1543439",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Mar 17 14:23:24 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
acknowledgement = ack-nhfb,
articleno = "6",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Mishra:2009:LPF,
author = "Prateek Mishra and Anish Muttreja and Niraj K. Jha",
title = "Low-power {FinFET} circuit synthesis using multiple
supply and threshold voltages",
journal = j-JETC,
volume = "5",
number = "2",
pages = "7:1--7:??",
month = jul,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1543438.1543440",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Mar 17 14:23:24 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "According to Moore's law, the number of transistors in
a chip doubles every 18 months. The increased
transistor-count leads to increased power density.
Thus, in modern circuits, power efficiency is a central
determinant of circuit efficiency. With scaling,
leakage power accounts for an increasingly larger
portion of the total power consumption in deep
submicron technologies ($ > $40\%).\par
FinFET technology has been proposed as a promising
alternative to deep submicron bulk CMOS technology,
because of its better scalability, short-channel
characteristics, and ability to suppress leakage
current and mitigate device-to-device variability when
compared to bulk CMOS. The subthreshold slope of a
FinFET is approximately 60mV which is close to
ideal.\par
In this article, we propose a methodology for low-power
FinFET based circuit synthesis. A mechanism called TCMS
(Threshold Control through Multiple Supply Voltages)
was previously proposed for improving the power
efficiency of FinFET based global interconnects. We
propose a significant generalization of TCMS to the
design of any logic circuit. This scheme represents a
significant divergence from the conventional multiple
supply voltage schemes considered in the past. It also
obviates the need for voltage level-converters. We
employ accurate delay and power estimates using table
look-up methods based on HSPICE simulations for supply
voltage and threshold voltage optimization.
Experimental results demonstrate that TCMS can provide
power savings of 67.6\% and device area savings of
65.2\% under relaxed delay constraints. Two other
variants of TCMS are also proposed that yield similar
benefits. We compare our scheme to extended cluster
voltage scaling (ECVS), a popular dual- {\em V$_{dd}$
\/} scheme presented in the literature. ECVS makes use
of voltage level-converters. Even when it is assumed
that these level-converters have zero delay, thus
significantly favoring ECVS in time-constrained power
optimization, TCMS still outperforms ECVS.",
acknowledgement = ack-nhfb,
articleno = "7",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "linear programming; Low-power; synthesis; TCMS",
}
@Article{Crocker:2009:DFQ,
author = "Michael Crocker and X. Sharon Hu and Michael Niemier",
title = "Defects and faults in {QCA}-based {PLAs}",
journal = j-JETC,
volume = "5",
number = "2",
pages = "8:1--8:??",
month = jul,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1543438.1543441",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Mar 17 14:23:24 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Defect tolerance will be critical in any system with
nanoscale feature sizes. This article examines some
fundamental aspects of defect tolerance for a
reconfigurable system based on Quantum-dot Cellular
Automata (QCA). We analyze a novel, QCA-based,
Programmable Logic Array (PLA) structure, develop an
implementation independent fault model, and discuss how
expected defects and faults might affect yield. Within
this context, we introduce techniques for mapping
Boolean logic functions to a defective QCA-based PLA.
Simulation results show that our new mapping techniques
can achieve higher yields than existing techniques.",
acknowledgement = ack-nhfb,
articleno = "8",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "defects; faults; logic mapping; Nanotechnology;
quantum-dot cellular automata",
}
@Article{Wu:2009:SCD,
author = "Xiaoxia Wu and Paul Falkenstern and Krishnendu
Chakrabarty and Yuan Xie",
title = "Scan-chain design and optimization for
three-dimensional integrated circuits",
journal = j-JETC,
volume = "5",
number = "2",
pages = "9:1--9:??",
month = jul,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1543438.1543442",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Mar 17 14:23:24 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Scan chains are widely used to improve the testability
of integrated circuit (IC) designs and to facilitate
fault diagnosis. For traditional 2D IC design, a number
of design techniques have been proposed in the
literature for scan-chain routing and scan-cell
partitioning. However, these techniques are not
effective for three-dimensional (3D) technologies,
which have recently emerged as a promising means to
continue technology scaling. In this article, we
propose two techniques for designing scan chains in 3D
ICs, with given constraints on the number of
through-silicon-vias (TSVs). The first technique is
based on a genetic algorithm (GA), and it addresses the
ordering of cells in a single scan chain. The second
optimization technique is based on integer linear
programming (ILP); it addresses single-scan-chain
ordering as well as the partitioning of scan flip-flops
into multiple scan chains. We compare these two methods
by conducting experiments on a set of ISCAS'89
benchmark circuits. The first conclusion obtained from
the results is that 3D scan-chain optimization achieves
significant wire-length reduction compared to 2D
counterparts. The second conclusion is that the
ILP-based technique provides lower bounds on the
scan-chain interconnect length for 3D ICs, and it
offers considerable reduction in wire-length compared
to the GA-based heuristic method.",
acknowledgement = ack-nhfb,
articleno = "9",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "3D ICs; genetic algorithm; integer linear programming;
LP relaxation; randomized rounding; scan-chain design",
}
@Article{Datta:2009:EPT,
author = "Siddhartha Datta and Bharat Joshi and Arun Ravindran
and Arindam Mukherjee",
title = "Efficient parallel testing and diagnosis of digital
microfluidic biochips",
journal = j-JETC,
volume = "5",
number = "2",
pages = "10:1--10:??",
month = jul,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1543438.1543443",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Mar 17 14:23:24 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Microfluidics-based biochips consist of microfluidic
arrays on rigid substrates through which movement of
fluids is tightly controlled to facilitate biological
reactions. Biochips are soon expected to revolutionize
biosensing, clinical diagnostics, environmental
monitoring, and drug discovery. Critical to the
deployment of the biochips in such diverse areas is the
dependability of these systems. Thus robust testing and
diagnosis techniques are required to ensure adequate
level of system dependability. Due to the underlying
mixed technology and mixed energy domains, such
biochips exhibit unique failure mechanisms and defects.
In this article efficient parallel testing and
diagnosis algorithms are presented that can detect and
locate single as well as multiple faults in a
microfluidic array without flooding the array, a
problem that has hampered realistic implementation of
several existing strategies. The fault diagnosis
algorithms are well suited for built-in self-test that
could drastically reduce the operating cost of
microfluidic biochip. Also, the proposed algorithms can
be used both for testing and fault diagnosis during
field operation as well as increasing yield during the
manufacturing phase of the biochip. Furthermore, these
algorithms can be applied to both online and offline
testing and diagnosis. Analytical results suggest that
these strategies can be used to design highly
dependable biochip systems.",
acknowledgement = ack-nhfb,
articleno = "10",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "defect tolerance; droplet flooding; fault tolerance;
Microfluidic biochip; microfluidics; multiple faults;
reconfigurability; testing",
}
@Article{Tahoori:2009:LOD,
author = "Mehdi B. Tahoori",
title = "Low-overhead defect tolerance in crossbar
nanoarchitectures",
journal = j-JETC,
volume = "5",
number = "2",
pages = "11:1--11:??",
month = jul,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1543438.1543444",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Mar 17 14:23:24 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "It is anticipated that the number of defects in
nanoscale devices fabricated using bottom-up
self-assembly process is significantly higher than that
for CMOS devices fabricated by conventional top-down
lithography patterning. This is mainly because of
inherent lack of control in self-assembly fabrication
as well as atomic scale of devices. The goal of defect
tolerance, as an integral part of nano computing, is to
obtain error-free computation from such fabrics
containing defective elements.\par
In this article, an application-independent defect
tolerant scheme for reconfigurable crossbar array
nanoarchitectures is presented. The main feature of
this approach is that the existence and location of
defective resources within the nano-fabric are hidden
from the entire design flow, resulting in minimum
post-fabrication customization per chip and minimum
changes to the entire design and synthesis flow. It is
also shown how to drastically minimize the area
overhead associated with this flow. The proposed
technique requires extraction of regular yet incomplete
defect-free subsets, in contrast to previously proposed
complete defect-free subsets. This can greatly reduce
the area overhead required for defect tolerance while
not sacrificing logic mapping or signal routing
capabilities. Extensive simulation results confirm
considerable reduction in the area overhead without any
negative impact on the usability of modified
defect-free subsets.",
acknowledgement = ack-nhfb,
articleno = "11",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "Defect tolerance; nanotechnology; reconfigurable
architectures",
}
@Article{Chakraborty:2009:SAD,
author = "Rajat Subhra Chakraborty and Swarup Bhunia",
title = "A study of asynchronous design methodology for robust
{CMOS}-nano hybrid system design",
journal = j-JETC,
volume = "5",
number = "3",
pages = "12:1--12:??",
month = aug,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1568485.1568486",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Mar 17 14:23:41 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Among the emerging alternatives to CMOS, molecular
electronics based diode-resistor crossbar fabric has
generated considerable interest in recent times. Logic
circuit design with future nano-scale molecular devices
using dense and regular crossbar fabrics is promising
in terms of integration density, performance and power
dissipation. However, circuit design using molecular
switches involves some major challenges: (1) lack of
voltage gain of these switches that prevents logic
cascading; (2) large output voltage level degradation;
(3) vulnerability to parameter variations that affect
yield and robustness of operation; and (4) high defect
rate. In this article, we analyze some of the above
challenges and investigate the effectiveness of
asynchronous design methodology in a hybrid system
design platform using molecular crossbar and CMOS
interfacing elements. We explore different approaches
of asynchronous circuit design and compare their
suitability in terms of several circuit design
parameters. We then develop the methodology and an
automated synthesis flow to support two different
asynchronous design approaches ({\em Micropipelines\/}
and {\em Four phase Dual-rail\/}) for system designs
using nano-crossbar logic stages and CMOS interface
data-storage elements. Circuit-level simulation results
for several benchmarks show considerable advantage in
terms of performance and robustness at moderate area
and power overhead compared to two different
synchronous implementations.",
acknowledgement = ack-nhfb,
articleno = "12",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "Asynchronous design; CMOS-nano co-design; dual-rail
circuits; logic degradation; micropipelines; nano-scale
crossbar; robust design",
}
@Article{Zhang:2009:HNCa,
author = "Wei Zhang and Niraj K. Jha and Li Shang",
title = "A hybrid {Nano\slash CMOS} dynamically reconfigurable
system --- {Part II}: {Design} optimization flow",
journal = j-JETC,
volume = "5",
number = "3",
pages = "13:1--13:??",
month = aug,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1568485.1568487",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Mar 17 14:23:41 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "In Part I of this work, a hybrid nano/CMOS
reconfigurable architecture, called NATURE, was
described. It is composed of CMOS reconfigurable logic
and interconnect fabric, and nonvolatile nano on-chip
memory. Through its support for cycle-by-cycle runtime
reconfiguration and a highly-efficient computation
model, temporal logic folding, NATURE improves logic
density and area-delay product by more than an order of
magnitude compared to existing CMOS-based
field-programmable gate arrays (FPGAs). NATURE can be
fabricated using mainstream photo-lithography
fabrication techniques. Thus, it offers a currently
commercially feasible architecture with high
performance, superior logic density, and excellent
runtime design flexibility.\par
In Part II of this work, we present an integrated
design and optimization flow for NATURE, called
NanoMap. Given an input design specified in
register-transfer level (RTL) and/or gate-level VHDL,
NanoMap optimizes and implements the design on NATURE
through logic mapping, temporal clustering, temporal
placement, and routing. As opposed to other design
tools for traditional FPGAs, NanoMap supports and
leverages temporal logic folding by integrating novel
mapping techniques. It can automatically explore and
identify the best temporal logic folding configuration,
targeting area, delay or area-delay product
optimization. A force-directed scheduling technique is
used to optimize and balance resource usage across
different folding cycles. By supporting logic folding,
NanoMap can provide significant design flexibility in
performing area-delay trade-offs under various
user-specified constraints. We present details of the
mapping procedure and results for different
architectural instances. Experimental results
demonstrate that NanoMap can judiciously trade off area
and delay targeting different optimization goals, and
effectively exploit the advantages of NATURE.\par
Part I of this work will appear in JETC Vol. 5, No.
4.",
acknowledgement = ack-nhfb,
articleno = "13",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "design optimization flow; Dynamic reconfiguration;
logic folding; NATURE",
}
@Article{Simsir:2009:HNC,
author = "Muzaffer O. Simsir and Srihari Cadambi and Franjo
Ivan{\v{c}}i{\'c} and Martin Roetteler and Niraj K.
Jha",
title = "A hybrid nano-{CMOS} architecture for defect and fault
tolerance",
journal = j-JETC,
volume = "5",
number = "3",
pages = "14:1--14:??",
month = aug,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1568485.1568488",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Mar 17 14:23:41 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "As the end of the semiconductor roadmap for CMOS
approaches, architectures based on nanoscale molecular
devices are attracting attention. Among several
alternatives, silicon nanowires and carbon nanotubes
are the two most promising nanotechnologies according
to the ITRS. These technologies may enable scaling deep
into the nanometer regime. However, they suffer from
very defect-prone manufacturing processes. Although the
reconfigurability property of the nanoscale devices can
be used to tolerate high defect rates, it may not be
possible to locate all defects. With very high device
densities, testing each component may not be possible
because of time or technology restrictions. This points
to a scenario in which even though the devices are
tested, the tests are not very comprehensive at
locating defects, and hence the shipped chips are still
defective. Moreover, the devices in the nanometer range
will be susceptible to transient faults which can
produce arbitrary soft errors. Despite these drawbacks,
it is possible to make nanoscale architectures
practical and realistic by introducing defect and fault
tolerance. In this article, we propose and evaluate a
hybrid nanowire-CMOS architecture that addresses all
three problems --- namely high defect rates, unlocated
defects, and transient faults --- at the same time.
This goal is achieved by using multiple levels of
redundancy and majority voters. A key aspect of the
architecture is that it contains a judicious balance of
both nanoscale and traditional CMOS components. A
companion to the architecture is a compiler with
heuristics to quickly determine if logic can be mapped
onto partially defective nanoscale elements. The
heuristics make it possible to introduce
defect-awareness in placement and routing. The
architecture and compiler are evaluated by applying the
complete design flow to several benchmarks.",
acknowledgement = ack-nhfb,
articleno = "14",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "Defect tolerance; nanotechnology; nanowires",
}
@Article{Wang:2009:UQD,
author = "Shuo Wang and Jianwei Dai and El-Sayed Hasaneen and
Lei Wang and Faquir Jain",
title = "Utilizing quantum dot transistors with programmable
threshold voltages for low-power mobile computing",
journal = j-JETC,
volume = "5",
number = "3",
pages = "15:1--15:??",
month = aug,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1568485.1568489",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Mar 17 14:23:41 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Power consumption poses one of the fundamental
barriers for deploying mobile computing devices in
energy-constrained situations with varying operation
conditions. In particular, leakage power is projected
to increase exponentially in future semiconductor
process nodes. This challenging problem is pressing for
renewed focus on power-performance optimization at all
levels of design abstraction, from novel device structures
to fundamental shifts in design paradigm. In this
article, we propose to exploit the programmable
threshold voltage quantum dot (QD) transistors to
reduce leakage thereby improving the energy efficiency
for mobile computing. The unique programmability and
reconfigurability enabled by QD transistors extend our
capability in design optimization for new
power-performance trade-offs. Simulation results
demonstrate the significant leakage reduction over
conventional techniques.",
acknowledgement = ack-nhfb,
articleno = "15",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "Low power; threshold voltage and quantum dot
transistor",
}
@Article{Zhang:2009:HNCb,
author = "Wei Zhang and Niraj K. Jha and Li Shang",
title = "A hybrid {nano\slash CMOS} dynamically reconfigurable
system --- {Part I}: {Architecture}",
journal = j-JETC,
volume = "5",
number = "4",
pages = "16:1--16:??",
month = nov,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1629091.1629092",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Mar 17 14:23:55 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Rapid progress on nanodevices points to a promising
direction for future circuit design. However, since
nanofabrication techniques are not yet mature,
implementation of nanocircuits, at least on a large
scale, in the near future is infeasible. To ease
fabrication and overcome the problem of high defect
levels in nanotechnology, hybrid nano/CMOS
reconfigurable architectures are attractive choices.
Moreover, if the current photolithography fabrication
process can be used to manufacture the hybrid chips,
the benefits of nanotechnologies can be realized
today.\par
Traditional reconfigurable architectures can only
support partial or coarse-grain runtime reconfiguration
due to their limited on-chip storage and long off-chip
reconfiguration latency. Recent progress on nano Random
Access Memories (RAMs), such as carbon nanotube-based
RAM (NRAM), Phase-Change Memory (PCM), magnetoresistive
RAM (MRAM), etc., provides us with a chance to realize
on-chip fine-grain runtime reconfiguration. These nano
RAMs have good compatibility with the current
fabrication process. By utilizing them in the hybrid
design, we can take advantage of both CMOS and
nanotechnology, and greatly improve the logic density,
resource utilization, and performance of our
design.\par
In this article, we propose a high-performance
reconfigurable architecture, called NATURE, that
utilizes CMOS logic and nano RAMs. An automatic design
flow for NATURE is presented in Part II of the article.
In NATURE, the highly dense nonvolatile nano RAMs are
distributed throughout the chip to allow large embedded
on-chip configuration storage, which enables fast
reading and hence supports fine-grain runtime
reconfiguration and temporal logic folding of a circuit
before being mapped to the architecture. Temporal logic
folding can significantly increase the logic density of
NATURE (by over an order of magnitude for large
circuits) while remaining competitive in performance
and power consumption. For ease of exposition, we use
NRAMs to illustrate various concepts in this article
due to the excellent properties of NRAMs. However,
other nano RAMs can also be used instead. Experimental
results based on NRAMs establish the efficacy of
NATURE.",
acknowledgement = ack-nhfb,
articleno = "16",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "logic folding; NRAM; runtime reconfiguration",
}
@Article{Zhang:2009:DSE,
author = "Wei Zhang and Niraj K. Jha and Li Shang",
title = "Design space exploration and data memory architecture
design for a hybrid {nano\slash CMOS} dynamically
reconfigurable architecture",
journal = j-JETC,
volume = "5",
number = "4",
pages = "17:1--17:??",
month = nov,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1629091.1629093",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Mar 17 14:23:55 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "In recent years, research on nanotechnology has
advanced rapidly. Novel nanodevices have been
developed, such as those based on carbon nanotubes,
nanowires, etc. Using these emerging nanodevices,
diverse nanoarchitectures have been proposed. Among
them, hybrid nano/CMOS reconfigurable architectures
have attracted attention because of their advantages in
performance, integration density, and fault tolerance.
Recently, a high-performance hybrid nano/CMOS
reconfigurable architecture, called NATURE, was
presented. NATURE comprises CMOS reconfigurable logic
and interconnect fabric, and
CMOS-fabrication-compatible nanomemory. High-density,
fast nano RAMs are distributed in NATURE as on-chip
storage to store multiple reconfiguration copies for
each reconfigurable element. It enables cycle-by-cycle
runtime reconfiguration and a highly efficient
computational model, called temporal logic folding.
Through logic folding, NATURE provides more than an
order of magnitude improvement in logic density and
area-delay product, and significant design flexibility
in performing area-delay trade-offs, at the same
technology node. Moreover, NATURE can be fabricated
using mainstream photolithography fabrication
techniques. Hence, it offers a currently commercially
viable reconfigurable architecture with high
performance, superior logic density, and outstanding
design flexibility, which is very attractive for
deployment in cost-conscious embedded systems.\par
In order to fully explore the potential of NATURE and
further improve its performance, in this article, a
thorough design space exploration is conducted to
optimize its architecture. Investigations in terms of
different logic element architectures, interconnect
designs, and various technologies for nano RAMs are
presented. Nano RAMs can not only be used as storage
for configuration bits, but the high density of nano
RAMs also makes them excellent candidates for
large-capacity on-chip data storage in NATURE. Many
logic- and memory-intensive applications, such as video
and image processing, require large storage of temporal
results. To enhance the capability of NATURE for
implementing such applications, we investigate the
design of nano data memory structures in NATURE and
explore the impact of memory density. Experimental
results demonstrate significant throughput improvements
due to area saving from logic folding and parallel data
processing.",
acknowledgement = ack-nhfb,
articleno = "17",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "logic folding; Nano data RAM; runtime
reconfiguration",
}
@Article{Tang:2009:DET,
author = "Weiguo Tang and Lei Wang and Fabrizio Lombardi",
title = "A defect\slash error-tolerant nanosystem architecture
for {DSP}",
journal = j-JETC,
volume = "5",
number = "4",
pages = "18:1--18:??",
month = nov,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1629091.1629094",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Mar 17 14:23:55 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Emerging technologies such as silicon NanoWires (NW)
and Carbon NanoTubes (CNT) have shown great potential
for building the next generation of computing systems
in the nano ranges. However, the excessive number of
defects originating from bottom-up fabrication (such as
a self-assembly process) poses a pressing challenge for
achieving scalable system integration. This article
proposes a new nanosystem architecture that employs
nanowire crossbars for Digital Signal Processing (DSP)
applications. Distributed arithmetic is utilized such
that complex signal processing computation can be
mapped into regular memory operations, thus making this
architecture well suited for implementation by nanowire
crossbars. Furthermore, the inherent features of
DSP-type computation provide new insights to remedy
errors (as logic/computational manifestation of
defects). A new defect/error-tolerant technique that
exploits algorithmic error compensation is proposed; at
system level different trade-offs between correctness
in output and performance are established while
retaining low overhead in its implementation. As an
instance of its application, the proposed approach has
been applied to a generic DSP nanosystem performing
frequency-selective filtering. Simulation results show
that the proposed nanoDSP introduces only a minor
performance degradation under high defect rates and at
a range of operational conditions. The proposed
technique also features good scalability and viability
for various DSP applications.",
acknowledgement = ack-nhfb,
articleno = "18",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "algorithmic error compensation; Distributed
arithmetic; DSP nanosystem; inner product",
}
@Article{Dysart:2009:OWR,
author = "Timothy J. Dysart and Peter M. Kogge",
title = "Organizing wires for reliability in magnetic {QCA}",
journal = j-JETC,
volume = "5",
number = "4",
pages = "19:1--19:??",
month = nov,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1629091.1629095",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Mar 17 14:23:55 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "This article investigates, via analytic modeling, how
a magnetic QCA wire should be organized to provide the
highest reliability. We compare a nonredundant wire and
two redundant wire organizations. For all three
organizations, a fault rate per unit length is used for
comparison; additionally, since extra components are
necessary to implement the redundant organizations,
these components are faulty as well. We show that the
difference between these two fault rates is the main
driver for selecting a wire organization. Lastly, we
develop a guideline for selecting the most reliable
wire organization during the circuit design process.",
acknowledgement = ack-nhfb,
articleno = "19",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "modular redundancy; nanomagnet logic; QCA",
}
@Article{Chakrabarty:2010:E,
author = "Krishnendu Chakrabarty",
title = "Editorial",
journal = j-JETC,
volume = "6",
number = "1",
pages = "1:1--1:??",
month = mar,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1721650.1721651",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Mar 17 14:24:05 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
acknowledgement = ack-nhfb,
articleno = "1",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Lee:2010:FBP,
author = "Chun-Yi Lee and Niraj K. Jha",
title = "{FinFET}-based power simulator for interconnection
networks",
journal = j-JETC,
volume = "6",
number = "1",
pages = "2:1--2:??",
month = mar,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1721650.1721652",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Mar 17 14:24:05 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Double-gate FETs, specifically FinFETs, are emerging
as promising substitutes for bulk CMOS at the 32nm
technology node and beyond because of the various
obstacles to scaling faced by CMOS, such as
short-channel effects, leakage power, and process
variations. Another trend in chip multiprocessor design
is incorporation of sophisticated on-chip
interconnection networks. However, such networks are
significant power-consumers. In this article, we
address these two trends by presenting a power
simulator for FinFET-based on-chip interconnection
networks. It estimates both dynamic and leakage power.
We present results for various FinFET design styles and
temperatures (since leakage power changes drastically
with temperature), and show that one FinFET design
style may be much superior to another from the power
consumption point of view.",
acknowledgement = ack-nhfb,
articleno = "2",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "FinFETs; interconnection network; power consumption;
power simulator",
}
@Article{Liu:2010:RSO,
author = "Yang Liu and Chris Dwyer and Alvin R. Lebeck",
title = "Routing in self-organizing nano-scale irregular
networks",
journal = j-JETC,
volume = "6",
number = "1",
pages = "3:1--3:??",
month = mar,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1721650.1721653",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Mar 17 14:24:05 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "The integration of novel nanotechnologies onto silicon
platforms is likely to increase fabrication defects
compared with traditional CMOS technologies.
Furthermore, the number of nodes connected with these
networks makes acquiring a global defect map
impractical. As a result, on-chip networks will provide
defect tolerance by self-organizing into irregular
topologies. In this scenario, simple static routing
algorithms based on regular physical topologies, such
as meshes, will be inadequate. Additionally, previous
routing approaches for irregular networks assume
abundant resources and do not apply to this domain of
resource-constrained self-organizing nano-scale
networks. Consequently, routing algorithms that work in
irregular networks with limited resources are
needed.\par
In this article, we explore routing for self-organizing
nano-scale irregular networks in the context of a
Self-Organizing SIMD Architecture (SOSA). Our approach
trades configuration time and a small amount of storage
for reduced communication latency. We augment an Euler
path-based routing technique for trees to generate
static shortest paths between certain pairs of nodes
while remaining deadlock free. Simulations of several
applications executing on SOSA show our proposed
routing algorithm can reduce execution time by 8\% to
30\%.",
acknowledgement = ack-nhfb,
articleno = "3",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "data parallel; DNA; nanocomputing; Self-organizing;
SIMD",
}
@Article{Kocak:2010:IDT,
author = "Taskin Kocak and Dhiraj Pradhan",
title = "Introduction to design techniques for energy
harvesting",
journal = j-JETC,
volume = "6",
number = "2",
pages = "4:1--4:??",
month = jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1773814.1773815",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Tue Sep 7 08:33:18 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
acknowledgement = ack-nhfb,
articleno = "4",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Wenck:2010:SST,
author = "Justin Wenck and Jamie Collier and Jeff Siebert and
Rajeevan Amirtharajah",
title = "Scaling self-timed systems powered by mechanical
vibration energy harvesting",
journal = j-JETC,
volume = "6",
number = "2",
pages = "5:1--5:??",
month = jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1773814.1773816",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Tue Sep 7 08:33:18 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Passive energy harvesting from mechanical vibration
has wide application in wearable devices and wireless
sensors to complement or replace batteries. Energy
harvesting efficiency can be increased by eliminating
AC/DC conversion. A test chip demonstrating
self-timing, power-on reset circuitry, and dynamic
memory for energy harvesting AC voltages has been
designed in 180 nm CMOS and tested. An energy scalable
DSP architecture implements FIR filters that consume as
little as 170 pJ per output sample. The on-chip DRAM
retains data for up to 28 ms while register data is
retained down to a supply voltage of 153 mV. Circuit
operation is confirmed for supply frequencies between
60 Hz and 1 kHz with power consumption below 130$ \mu
$W. Reaching the limits of miniaturization will require
approaching the limits of power dissipation. We
extrapolate from this DSP architecture to find the
minimum volume required for mechanical vibration energy
harvesting sensors.",
acknowledgement = ack-nhfb,
articleno = "5",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "AC power supply; DRAM; energy harvesting; energy-aware
systems; integrated circuits; low-power design;
power-on reset; scaling; self-timed",
}
@Article{Wang:2010:DCS,
author = "W. S. Wang and T. O'Donnell and N. Wang and M. Hayes
and B. O'Flynn and C. O'Mathuna",
title = "Design considerations of sub-{mW} indoor light energy
harvesting for wireless sensor systems",
journal = j-JETC,
volume = "6",
number = "2",
pages = "6:1--6:??",
month = jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1773814.1773817",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Tue Sep 7 08:33:18 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "For most wireless sensor networks, one common and
major bottleneck is the limited battery lifetime. The
frequent maintenance efforts associated with battery
replacement significantly increase the system
operational and logistics cost. Unnoticed power
failures on nodes will degrade the system reliability
and may lead to system failure. In building management
applications, to solve this problem, small energy
sources such as indoor light energy are promising to
provide long-term power to these distributed wireless
sensor nodes. This article provides comprehensive
design considerations for an indoor light energy
harvesting system for building management applications.
Photovoltaic cells characteristics, energy storage
units, power management circuit design, and power
consumption pattern of the target mote are presented.
Maximum power point tracking circuits are proposed
which significantly increase the power obtained from
the solar cells. The novel fast charge circuit reduces
the charging time. A prototype was then successfully
built and tested in various indoor light conditions to
discover the practical issues of the design. The
evaluation results show that the proposed prototype
increases the power harvested from the PV cells by 30\%
and also accelerates the charging rate by 34\% in a
typical indoor lighting condition. By entirely
eliminating the rechargeable battery as energy storage,
the proposed system would expect an operational
lifetime 10--20 years instead of the current less than
6 months battery lifetime.",
acknowledgement = ack-nhfb,
articleno = "6",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "Design consideration; energy harvesting; indoor light
illuminance; maximum power point tracking; PV cells;
wireless sensor node; supercapacitor",
}
@Article{Moser:2010:EMF,
author = "Clemens Moser and Jian-Jia Chen and Lothar Thiele",
title = "An energy management framework for energy harvesting
embedded systems",
journal = j-JETC,
volume = "6",
number = "2",
pages = "7:1--7:??",
month = jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1773814.1773818",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Tue Sep 7 08:33:18 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Energy harvesting (also known as energy scavenging) is
the process of generating electrical energy from
environmental energy sources. There exists a variety of
different energy sources such as solar energy, kinetic
energy, or thermal energy. In recent years, this term
has been frequently applied in the context of small
autonomous devices such as wireless sensor nodes. In
this article, a framework for energy management in
energy harvesting embedded systems is presented. As a
possible scenario, we focus on wireless sensor nodes
that are powered by solar cells. We demonstrate that
classical power management solutions have to be
reconceived and/or new problems arise if perpetual
operation of the system is required. In particular, we
provide a set of algorithms and methods for various
application scenarios, including real-time scheduling,
application rate control, as well as reward
maximization. The goal is to optimize the performance
of the application subject to given energy constraints.
Our methods optimize the system performance which, for
example, allows the usage of smaller solar cells and
smaller batteries. Furthermore, we show how to
dimension important system parameters like the minimum
battery capacity or a sufficient prediction horizon.
Our theoretical results are supported by simulations
using long-term measurements of solar energy in an
outdoor environment. In contrast to previous works, we
present a formal framework which is able to capture the
performance, the parameters, and the energy model of
various energy harvesting systems. We combine different
viewpoints, include corresponding simulation results,
and provide a thorough discussion of implementation
aspects.",
acknowledgement = ack-nhfb,
articleno = "7",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "embedded systems; energy harvesting; model predictive
control; Power management; real-time scheduling; reward
maximization",
}
@Article{Mohanty:2010:UDS,
author = "Saraju P. Mohanty and Dhiraj K. Pradhan",
title = "{ULS}: a dual-{$ V_{th} $} \slash high-{$ \kappa $}
nano-{CMOS} universal level shifter for system-level
power management",
journal = j-JETC,
volume = "6",
number = "2",
pages = "8:1--8:??",
month = jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1773814.1773819",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Tue Sep 7 08:33:18 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Power dissipation is a major bottleneck for emerging
applications, such as implantable systems, digital
cameras, and multimedia processors. Each of these
applications is essentially designed as an
Analog/Mixed-Signal System-on-a-Chip (AMS-SoC). These
AMS-SoCs are typically operated from a single
power-supply source which is a battery providing a
constant supply voltage. In order to reduce power
dissipation of the AMS-SoCs, multiple-supply voltage
and/or variable-supply voltage is used as an attractive
low-power design approach. In the
multiple-/variable-supply voltage AMS-SoCs the use of a
DC-to-DC voltage-level shifter is critical. The
voltage-level shifter is an overhead when its own power
dissipation is high. In this article a new DC-to-DC
voltage-level shifter is introduced that performs
level-up shifting, level-down shifting, and blocking of
voltages and is called Universal Level Shifter (ULS).
The ULS is a unique component that reduces dynamic
power and leakage of the AMS-SoCs while facilitating
their reconfigurability. The system-level architectures
for three AMS-SoCs, such as Drug Delivery
Nano-Electro-Mechanical-System (DDNEMS), Secure Digital
Camera (SDC), and Net-centric Multimedia Processor
(NMP) are introduced to demonstrate the use the ULS for
system-level power management. The article presents a
design flow and an algorithm for optimal design of the
ULS using a dual-$ V_{th} $ high-$ \kappa $ technique
for efficient realization of ULS. A prototype ULS is
presented for 32nm nano-CMOS technology node. The
robustness of the ULS design is examined by performing
three types of analysis, such as parametric, load, and
power. It is observed that the ULS produces a stable
output for voltages as low as 0.35 V and loads varying
from 50 {\em fF\/} to 120 {\em fF}. The average power
dissipation of the ULS with a 82 {\em fF\/} capacitive
load is 5 $ \mu ${\em W}.",
acknowledgement = ack-nhfb,
articleno = "8",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "Analog/Mixed-Signal System-on-a-Chip (AMS-SoC);
DC-to-DC voltage-level shifter; dual-threshold voltage;
high-$ \kappa $/metal-gate nano-CMOS; low-power design;
nanoscale CMOS; Power management; system-level power
management",
}
@Article{Dai:2010:ITA,
author = "Jianwei Dai and Lei Wang and Fabrizio Lombardi",
title = "An information-theoretic analysis of quantum-dot
cellular automata for defect tolerance",
journal = j-JETC,
volume = "6",
number = "3",
pages = "9:1--9:??",
month = aug,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1777401.1777402",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Tue Sep 7 08:33:31 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Quantum-dot cellular automata (QCA) has been advocated
as a promising emerging nanotechnology for designing
future nanocomputing systems. However, at device level,
the large number of expected defects represents a
significant hurdle for reliable computation in
QCA-based systems. In this paper, we present an
information-theoretic approach to investigate the
relationship between defect tolerance and redundancy in
QCA devices. By modeling defect-prone QCA devices as
unreliable information processing media, we determine
the information transfer capacity, as bound on the
reliability that QCA devices can achieve. The proposed
method allows to evaluate the effectiveness of
redundancy-based defect tolerance in an effective and
quantitative manner.",
acknowledgement = ack-nhfb,
articleno = "9",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "defect tolerance; information theoretic analysis; QCA;
reliability",
}
@Article{Zhang:2010:LPN,
author = "Wei Zhang and Niraj K. Jha and Li Shang",
title = "Low-power {$3$D} nano\slash {CMOS} hybrid dynamically
reconfigurable architecture",
journal = j-JETC,
volume = "6",
number = "3",
pages = "10:1--10:??",
month = aug,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1777401.1777403",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Tue Sep 7 08:33:31 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "In order to continue technology scaling beyond CMOS,
diverse nanoarchitectures have been proposed in recent
years based on emerging nanodevices, such as nanotubes,
nanowires, etc. Among them, some hybrid nano/CMOS
reconfigurable architectures enjoy the advantage that
they can be fabricated using photolithography. NATURE
is one such architecture that we have proposed
recently. It comprises CMOS reconfigurable logic and
CMOS fabrication-compatible nano RAMs. It uses
distributed high-density and fast nano RAMs as on-chip
storage for storing multiple reconfiguration copies,
enabling fine-grain cycle-by-cycle reconfiguration. It
supports a highly efficient computational model, called
temporal logic folding, which makes possible more than
an order of magnitude improvement in logic density and
area-delay product, significant power reduction, and
significant design flexibility in performing area-delay
trade-offs.\par
In this article, we extend NATURE in various
dimensions, evaluating various FPGA approaches in the
context of today's emerging technologies. First, we
explore the introduction of embedded coarse-grain
modules in the fine-grain NATURE architecture and
present a unified dynamically reconfigurable
architecture, which can significantly enhance NATURE's
computation power for data-dominated applications.
Second, we explore a 3D architecture for NATURE in
which the nano RAM for reconfiguration storage is on
one layer and the rest of the CMOS logic on another
layer. This leads to further improvements in logic
density and performance. Finally, we explore the
possibility of using FinFETs, an emerging double-gate
CMOS technology, to implement NATURE. Since power
consumption is an important consideration in the deep
nanometer regime, especially for FPGAs, we present a
back-gate biasing methodology for flexible threshold
voltage adjustment in FinFETs to significantly reduce
NATURE's power consumption. Simulation results
demonstrate the efficacy of the proposed methods.",
acknowledgement = ack-nhfb,
articleno = "10",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "3D design; Coarse-grain; FinFET; runtime
reconfiguration",
}
@Article{Zhao:2010:ICP,
author = "Yang Zhao and Tao Xu and Krishnendu Chakrabarty",
title = "Integrated control-path design and error recovery in
the synthesis of digital microfluidic lab-on-chip",
journal = j-JETC,
volume = "6",
number = "3",
pages = "11:1--11:??",
month = aug,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1777401.1777404",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Tue Sep 7 08:33:31 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Recent advances in digital microfluidics have led to
tremendous interest in miniaturized lab-on-chip devices
for biochemical analysis. Synthesis tools have also
emerged for the automated design of lab-on-chip from
the specifications of laboratory protocols. However,
none of these tools consider control flow or address
the problem of recovering from fluidic errors that can
occur during on-chip bioassay execution. We present a
synthesis method that incorporates control paths and an
error-recovery mechanism in the design of a digital
microfluidic lab-on-chip. Based on error-propagation
estimates, we determine the best locations for fluidic
checkpoints during biochip synthesis. A microcontroller
coordinates the implementation of the
control-flow-based bioassay by intercepting the
synthesis results that are mapped to the software
programs. Real-life bioassay applications are used as
case studies to evaluate the proposed design method.
For a representative protein assay, compared to a
baseline chip design, the biochip with a control path
can reduce the completion time by 30\% when errors
occur during the implementation of the bioassay.",
acknowledgement = ack-nhfb,
articleno = "11",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
keywords = "biochips; Error recovery; microfluidics; synthesis",
}
@Article{Bhoj:2010:GDF,
author = "Ajay N. Bhoj and Niraj K. Jha",
title = "Gated-diode {FinFET DRAMs}: Device and circuit
design-considerations",
journal = j-JETC,
volume = "6",
number = "4",
pages = "12:1--12:??",
month = dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1877745.1877746",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Mon Mar 28 12:17:02 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
acknowledgement = ack-nhfb,
articleno = "12",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Saeedi:2010:RCS,
author = "Mehdi Saeedi and Morteza Saheb Zamani and Mehdi
Sedighi and Zahra Sasanian",
title = "Reversible circuit synthesis using a cycle-based
approach",
journal = j-JETC,
volume = "6",
number = "4",
pages = "13:1--13:??",
month = dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1877745.1877747",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Mon Mar 28 12:17:02 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
acknowledgement = ack-nhfb,
articleno = "13",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Thapliyal:2010:DRS,
author = "Himanshu Thapliyal and Nagarajan Ranganathan",
title = "Design of reversible sequential circuits optimizing
quantum cost, delay, and garbage outputs",
journal = j-JETC,
volume = "6",
number = "4",
pages = "14:1--14:??",
month = dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1877745.1877748",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Mon Mar 28 12:17:02 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
acknowledgement = ack-nhfb,
articleno = "14",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Singh:2010:CPD,
author = "Montek Singh and Steven M. Nowick",
title = "Call for Papers: Deadline: {March 15, 2011}",
journal = j-JETC,
volume = "6",
number = "4",
pages = "15:1--15:??",
month = dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1877745.1877749",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Mon Mar 28 12:17:02 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
acknowledgement = ack-nhfb,
articleno = "15",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Das:2011:ISI,
author = "Shamik Das and Garrett S. Rose",
title = "Introduction to Special Issue: Highlights of
{NANOARCH'09}",
journal = j-JETC,
volume = "7",
number = "1",
pages = "1:1--1:??",
month = jan,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1899390.1899391",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Mon Mar 28 12:17:03 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
acknowledgement = ack-nhfb,
articleno = "1",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Dingler:2011:PEI,
author = "Aaron Dingler and Michael T. Niemier and Xiaobo Sharon
Hu and Evan Lent",
title = "Performance and Energy Impact of Locally Controlled
{NML} Circuits",
journal = j-JETC,
volume = "7",
number = "1",
pages = "2:1--2:??",
month = jan,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1899390.1899392",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Mon Mar 28 12:17:03 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
acknowledgement = ack-nhfb,
articleno = "2",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Gaillardon:2011:MNB,
author = "P.-E. Gaillardon and F. Clermidy and I. O'Connor and
J. Liu and M. Amadou and G. Nicolescu",
title = "Matrix Nanodevice-Based Logic Architectures and
Associated Functional Mapping Method",
journal = j-JETC,
volume = "7",
number = "1",
pages = "3:1--3:??",
month = jan,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1899390.1899393",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Mon Mar 28 12:17:03 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
acknowledgement = ack-nhfb,
articleno = "3",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Haron:2011:RRN,
author = "Nor Zaidi Haron and Said Hamdioui",
title = "Redundant Residue Number System Code for
Fault-Tolerant Hybrid Memories",
journal = j-JETC,
volume = "7",
number = "1",
pages = "4:1--4:??",
month = jan,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1899390.1899394",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Mon Mar 28 12:17:03 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
acknowledgement = ack-nhfb,
articleno = "4",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Shang:2011:INC,
author = "Li Shang and Qianfan Xu",
title = "Introduction to nanophotonic communication technology
integration",
journal = j-JETC,
volume = "7",
number = "2",
pages = "5:1--5:??",
month = jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1970406.1970407",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Aug 18 12:25:12 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
acknowledgement = ack-nhfb,
articleno = "5",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Beausoleil:2011:LSI,
author = "Raymond G. Beausoleil",
title = "Large-scale integrated photonics for high-performance
interconnects",
journal = j-JETC,
volume = "7",
number = "2",
pages = "6:1--6:??",
month = jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1970406.1970408",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Aug 18 12:25:12 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
acknowledgement = ack-nhfb,
articleno = "6",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Biberman:2011:PNC,
author = "Aleksandr Biberman and Kyle Preston and Gilbert Hendry
and Nicol{\'a}s Sherwood-Droz and Johnnie Chan and
Jacob S. Levy and Michal Lipson and Keren Bergman",
title = "Photonic network-on-chip architectures using
multilayer deposited silicon materials for
high-performance chip multiprocessors",
journal = j-JETC,
volume = "7",
number = "2",
pages = "7:1--7:??",
month = jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1970406.1970409",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Aug 18 12:25:12 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
acknowledgement = ack-nhfb,
articleno = "7",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Li:2011:IHN,
author = "Zheng Li and Moustafa Mohamed and Xi Chen and Hongyu
Zhou and Alan Mickelson and Li Shang and Manish
Vachharajani",
title = "{Iris}: a hybrid nanophotonic network design for
high-performance and low-power on-chip communication",
journal = j-JETC,
volume = "7",
number = "2",
pages = "8:1--8:??",
month = jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1970406.1970410",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Aug 18 12:25:12 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
acknowledgement = ack-nhfb,
articleno = "8",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Cianchetti:2011:LLH,
author = "Mark J. Cianchetti and David H. Albonesi",
title = "A low-latency, high-throughput on-chip optical router
architecture for future chip multiprocessors",
journal = j-JETC,
volume = "7",
number = "2",
pages = "9:1--9:??",
month = jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1970406.1970411",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Aug 18 12:25:12 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
acknowledgement = ack-nhfb,
articleno = "9",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Zhang:2011:FBP,
author = "Meng Zhang and Niraj K. Jha",
title = "{FinFET}-Based Power Management for Improved {DPA}
Resistance with Low Overhead",
journal = j-JETC,
volume = "7",
number = "3",
pages = "10:1--10:??",
month = aug,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2000502.2000503",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Aug 18 12:25:13 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Differential power analysis (DPA) is a side-channel
attack that statistically analyzes the power
consumption of a cryptographic system to obtain secret
information. This type of attack is well known as a
major threat to information security. Effective
solutions with low energy and area cost for improved
DPA resistance are urgently needed, especially for
energy-constrained modern devices that are often in the
physical proximity of attackers. This article presents
a novel countermeasure against DPA attacks on smart
cards and other digital ICs based on FinFETs, an
emerging substitute for bulk CMOS at the 22nm
technology node and beyond. We exploit the adaptive
power management characteristic of FinFETs to generate
a high level of noise at critical moments in the
execution of a cryptosystem to thwart DPA attacks.",
acknowledgement = ack-nhfb,
articleno = "10",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Choi:2011:EQI,
author = "Byung-Soo Choi and Rodney {Van Meter}",
title = "On the Effect of Quantum Interaction Distance on
Quantum Addition Circuits",
journal = j-JETC,
volume = "7",
number = "3",
pages = "11:1--11:17",
month = aug,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2000502.2000504",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Aug 18 12:25:13 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "We investigate the theoretical limits of the effect of
the quantum interaction distance on the speed of exact
quantum addition circuits. For this study, we exploit
graph embedding for quantum circuit analysis. We study
a logical mapping of qubits and gates of any $ \Omega
(\log n)$-depth quantum adder circuit for two $n$-qubit
registers onto a practical architecture, which limits
interaction distance to the nearest neighbors only and
supports only one- and two-qubit logical gates.
Unfortunately, on the chosen $k$-dimensional practical
architecture, we prove that the depth lower bound of
any exact quantum addition circuits is no longer $
\Omega (\log n)$, but $ \Omega (\root k \of n)$.",
acknowledgement = ack-nhfb,
articleno = "11",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Goren:2011:DAN,
author = "Sezer G{\"o}ren and H. Fatih Ugurdag and Okan Palaz",
title = "Defect-Aware Nanocrossbar Logic Mapping through Matrix
Canonization Using Two-Dimensional Radix Sort",
journal = j-JETC,
volume = "7",
number = "3",
pages = "12:1--12:??",
month = aug,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2000502.2000505",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Aug 18 12:25:13 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Nanocrossbars (i.e., nanowire crossbars) offer extreme
logic densities but come with very high defect rates;
stuck-open/closed, broken nanowires. Achieving
reasonable yield and utilization requires logic mapping
that is defect-aware even at the crosspoint level. Such
logic mapping works with a defect map per each
manufactured chip. The problem can be expressed as
matching of two bipartite graphs; one for the logic to
be implemented and other for the nanocrossbar. This
article shows that the problem becomes a Bipartite
SubGraph Isomorphism (BSGI) problem within
sub-nanocrossbars free of stuck-closed faults. Our
heuristic KNS-2DS is an iterative rough canonizer with
approximately $ O(N^2) $ complexity followed by an $ O(N^3) $
matching algorithm.",
acknowledgement = ack-nhfb,
articleno = "12",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Devadoss:2011:PQT,
author = "Rajeswari Devadoss and Kolin Paul and M.
Balakrishnan",
title = "{p-QCA}: a Tiled Programmable Fabric Architecture
Using Molecular Quantum-Dot Cellular Automata",
journal = j-JETC,
volume = "7",
number = "3",
pages = "13:1--13:??",
month = aug,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2000502.2000506",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Aug 18 12:25:13 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Quantum-dot cellular automata is an interesting
computation fabric with many never-seen-before
properties. However, no programmable fabric scheme has
utilized all these properties effectively. We propose
an architecture for a programmable device using
molecular QCA which exploits all the specialities of
the fabric. The architecture taps the flexibility
provided by the clocking system of molecular QCA to
build a simple tile-based programmable device with the
3-input Majority gate as the fundamental logic
element.",
acknowledgement = ack-nhfb,
articleno = "13",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Singh:2011:ISI,
author = "Montek Singh and Steven M. Nowick",
title = "Introduction to Special Issue: Asynchrony in System
Design",
journal = j-JETC,
volume = "7",
number = "4",
pages = "14:1--14:??",
month = dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2043643.2043644",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Dec 15 09:46:08 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
acknowledgement = ack-nhfb,
articleno = "14",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Vacca:2011:ASN,
author = "Marco Vacca and Mariagrazia Graziano and Maurizio
Zamboni",
title = "Asynchronous Solutions for Nanomagnetic Logic
Circuits",
journal = j-JETC,
volume = "7",
number = "4",
pages = "15:1--15:??",
month = dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2043643.2043645",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Dec 15 09:46:08 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "In the years to come new solutions will be required to
overcome the limitations of scaled CMOS technology. One
approach is to adopt Nano-Magnetic Logic Circuits,
highly appealing for their extremely reduced power
consumption. Despite the interesting nature of this
approach, many problems arise when this technology is
considered for real designs. The wire is the most
critical of these problems from the circuit
implementation point of view. It works as a pipelined
interconnection, and its delay in terms of clock cycles
depends on its length. Serious complications arise at
the design phase, both in terms of synthesis and of
physical design.",
acknowledgement = ack-nhfb,
articleno = "15",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Zhang:2011:NPD,
author = "Xuefu Zhang and Delong Shang and Fei Xia and Alex
Yakovlev",
title = "A Novel Power Delivery Method for Asynchronous Loads
in Energy Harvesting Systems",
journal = j-JETC,
volume = "7",
number = "4",
pages = "16:1--16:??",
month = dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2043643.2043646",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Dec 15 09:46:08 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "For systems depending on power harvesting, a
fundamental contradiction in the power delivery chain
has existed between conventional synchronous
computational loads requiring relatively stable Vdd and
power harvesters unable to supply it. DC/DC conversion
has therefore been an integral part of such systems to
resolve this contradiction. On the other hand,
asynchronous computational loads, in addition to their
potential power-saving capabilities, can be made
tolerant to a much wider range of Vdd variance. This
may open up opportunities for much more energy
efficient methods of power delivery. This article
presents in-depth investigations into the behavior and
performance of different on-chip power delivery methods
driving both asynchronous and synchronous loads
directly from a harvester source.",
acknowledgement = ack-nhfb,
articleno = "16",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Plana:2011:SDI,
author = "Luis A. Plana and David Clark and Simon Davidson and
Steve Furber and Jim Garside and Eustace Painkras and
Jeffrey Pepper and Steve Temple and John Bainbridge",
title = "{SpiNNaker}: Design and Implementation of a {GALS}
Multicore {System-on-Chip}",
journal = j-JETC,
volume = "7",
number = "4",
pages = "17:1--17:??",
month = dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2043643.2043647",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Dec 15 09:46:08 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "The design and implementation of globally asynchronous
locally synchronous systems-on-chip is a challenging
activity. The large size and complexity of the systems
require the use of computer-aided design (CAD) tools
but, unfortunately, most tools do not work adequately
with asynchronous circuits. This article describes the
successful design and implementation of SpiNNaker, a
GALS multicore system-on-chip. The process was
completed using commercial CAD tools from synthesis to
layout. A hierarchical methodology was devised to deal
with the asynchronous sections of the system,
encapsulating and validating timing assumptions at each
level. The crossbar topology combined with a pipelined
asynchronous fabric implementation allows the on-chip
network to meet the stringent requirements of the
system.",
acknowledgement = ack-nhfb,
articleno = "17",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Galceran-Oms:2011:MTU,
author = "Marc Galceran-Oms and Alexander Gotmanov and Jordi
Cortadella and Mike Kishinevsky",
title = "Microarchitectural Transformations Using Elasticity",
journal = j-JETC,
volume = "7",
number = "4",
pages = "18:1--18:??",
month = dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2043643.2043648",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Dec 15 09:46:08 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Elasticity is a paradigm that tolerates the variations
in computation and communication delays. By applying
elastic transformations that allow varying the original
timing, circuits can be optimized beyond the
conventional rigid transformations that do not modify
the external timing. Pipelining is one of the classical
techniques to improve the throughput of a circuit. This
article reveals how elasticity can be effectively and
practically used to derive pipelined circuits by using
correct-by-construction transformations that can be
fully automated. Two designs, one of them industrial,
are used to demonstrate how the area-performance
trade-off can be explored using elasticity.",
acknowledgement = ack-nhfb,
articleno = "18",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Sheikh:2011:EEP,
author = "Basit Riaz Sheikh and Rajit Manohar",
title = "{Energy-Efficient} Pipeline Templates for
{High-Performance} Asynchronous Circuits",
journal = j-JETC,
volume = "7",
number = "4",
pages = "19:1--19:??",
month = dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2043643.2043649",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Dec 15 09:46:08 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "We present two novel energy-efficient pipeline
templates for high throughput asynchronous circuits.
The proposed templates, called N-P and N-Inverter
pipelines, use a single-track handshake protocol. There
are multiple stages of logic within each pipeline. The
proposed techniques minimize handshake overheads
associated with input tokens and intermediate logic
nodes within a pipeline template. Each template can
pack a significant amount of logic in a single stage,
while still maintaining a fast cycle time of only 18
transitions. Noise and timing robustness constraints of
our pipelined circuits are quantified across all
process corners. We present completion detection scheme
based on wide NOR gates, which results in significant
latency and energy savings especially as the number of
outputs increase.",
acknowledgement = ack-nhfb,
articleno = "19",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Matherat:2011:RCC,
author = "Philippe Matherat and Marc-Thierry Jaekel",
title = "Relativistic Causality and Clockless Circuits",
journal = j-JETC,
volume = "7",
number = "4",
pages = "20:1--20:??",
month = dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2043643.2043650",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Dec 15 09:46:08 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Time plays a crucial role in the performance of
computing systems. The accurate modelling of logical
devices, and of their physical implementations,
requires an appropriate representation of time and of
all properties that depend on this notion. The need for
a proper model, particularly acute in the design of
clockless delay-insensitive (DI) circuits, leads one to
reconsider the classical descriptions of time and of
the resulting order and causal relations satisfied by
logical operations. This questioning meets the
criticisms of classical spacetime formulated by
Einstein when founding relativity theory and is
answered by relativistic conceptions of time and
causality. Applying this approach to clockless circuits
and considering the trace formalism, we rewrite
Udding's rules, which characterize communications
between DI components. We exhibit their intrinsic
relation with relativistic causality. For that purpose,
we introduce relativistic generalizations of traces,
called R-traces, which provide a pertinent description
of communications and compositions of DI components.",
acknowledgement = ack-nhfb,
articleno = "20",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Crocker:2012:RPA,
author = "Michael Crocker and Michael Niemier and X. Sharon Hu",
title = "A Reconfigurable {PLA} Architecture for Nanomagnet
Logic",
journal = j-JETC,
volume = "8",
number = "1",
pages = "1:1--1:??",
month = feb,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2093145.2093146",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Tue Feb 28 16:37:42 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "In order to continue the performance and scaling
trends that we have come to expect from Moore's Law,
many emergent computational models, devices, and
technologies are actively being studied to either
replace or augment CMOS technology. Nanomagnet Logic
(NML) is one such alternative. NML operates at room
temperature, it has the potential for low power
consumption, and it is CMOS compatible. In this
article, we present an NML programmable logic array
(PLA) based on a previously proposed reprogrammable
quantum-dot cellular automata PLA design. We also
discuss the fabrication and simulation validation of
the circuit structures unique to the NML PLA, present
area, energy, and delay estimates for the NML PLA,
compare the area of NML PLAs to other reprogrammable
nanotechnologies, and analyze how architectural-level
redundancy will affect performance and defect tolerance
in NML PLAs.",
acknowledgement = ack-nhfb,
articleno = "1",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Henry:2012:TNH,
author = "Michael B. Henry and Leyla Nazhandali",
title = "From Transistors to {NEMS}: Highly Efficient
Power-Gating of {CMOS} Circuits",
journal = j-JETC,
volume = "8",
number = "1",
pages = "2:1--2:??",
month = feb,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2093145.2093147",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Tue Feb 28 16:37:42 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "A rapidly growing class of battery constrained
electronic applications are those with very long sleep
periods, such as structural health monitoring systems,
biomedical implants, and wireless border security
cameras. The traditional method for sleep-mode power
reduction, transistor power gating, has drawbacks,
including performance loss and residual leakage. This
article presents a thorough evaluation of a new
nanotechnology-enabled power gating structure,
CMOS-compatible NEMS switches, in the presence of
aggressive supply voltage scaling. Due to the infinite
off-resistance of the NEMS switches, the average power
consumption of an FFT processor performing 1 FFT per
hour drops by around 30 times compared to a
transistor-based power gating implementation.",
acknowledgement = ack-nhfb,
articleno = "2",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Tolbert:2012:MDA,
author = "Jeremy R. Tolbert and Pratik Kabali and Simeranjit
Brar and Saibal Mukhopadhyay",
title = "Modeling and Designing for Accuracy and Energy
Efficiency in Wireless Electroencephalography Systems",
journal = j-JETC,
volume = "8",
number = "1",
pages = "3:1--3:??",
month = feb,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2093145.2093148",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Tue Feb 28 16:37:42 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Remote wireless monitoring of physiological signals
has emerged as a key enabler for biotelemetry and can
significantly improve the delivery of healthcare.
Improving the energy efficiency and battery lifetime of
the monitoring units without sacrificing the acquired
signal quality is a key challenge in large-scale
deployment of bioelectronic systems for remote wireless
monitoring. In this article, we present a design
methodology for accuracy aware, energy efficient
wireless monitoring of electroencephalography (EEG)
data. The proposed design performs a real-time accuracy
energy trade-off by controlling the volume of
transmitted data based on the information content in
the EEG signal. We consider the effect of different
system parameters in order to design an optimal
system.",
acknowledgement = ack-nhfb,
articleno = "3",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Naruse:2012:SDN,
author = "Makoto Naruse and Ferdinand Peper and Kouichi Akahane
and Naokatsu Yamamoto and Tadashi Kawazoe and Naoya
Tate and Motoichi Ohtsu",
title = "Skew Dependence of Nanophotonic Devices Based on
Optical Near-Field Interactions",
journal = j-JETC,
volume = "8",
number = "1",
pages = "4:1--4:??",
month = feb,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2093145.2093149",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Tue Feb 28 16:37:42 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "We examine the timing dependence of nanophotonic
devices based on optical excitation transfer via
optical near-field interactions at the nanometer scale.
We theoretically analyze the dynamic behavior of a
two-input nanophotonic switch composed of three quantum
dots based on a density matrix formalism while assuming
arrival-time differences, or skew, between the inputs.
The analysis reveals that the nanophotonic switch is
resistant to a skew longer than the input signal
duration, and the tolerance to skew is asymmetric with
respect to the two inputs. The skew dependence is also
experimentally examined based on near-field
spectroscopy of InGaAs quantum dots, showing good
agreement with the theory.",
acknowledgement = ack-nhfb,
articleno = "4",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Ye:2012:TBH,
author = "Yaoyao Ye and Jiang Xu and Xiaowen Wu and Wei Zhang
and Weichen Liu and Mahdi Nikdast",
title = "A Torus-Based Hierarchical Optical-Electronic
{Network-on-Chip} for Multiprocessor {System-on-Chip}",
journal = j-JETC,
volume = "8",
number = "1",
pages = "5:1--5:??",
month = feb,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2093145.2093150",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Tue Feb 28 16:37:42 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Networks-on-chip (NoCs) are emerging as a key on-chip
communication architecture for multiprocessor
systems-on-chip (MPSoCs). Optical communication
technologies are introduced to NoCs in order to empower
ultra-high bandwidth with low power consumption.
However, in existing optical NoCs, communication
locality is poorly supported, and the importance of
floorplanning is overlooked. These significantly limit
the power efficiency and performance of optical NoCs.
In this work, we address these issues and propose a
torus-based hierarchical hybrid optical-electronic NoC,
called THOE. THOE takes advantage of both electrical
and optical routers and interconnects in a hierarchical
manner. It employs several new techniques including
floorplan optimization, an adaptive power control
mechanism, low-latency control protocols, and hybrid
optical-electrical routers with a low-power optical
switching fabric.",
acknowledgement = ack-nhfb,
articleno = "5",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Manem:2012:DCM,
author = "H. Manem and J. Rajendran and G. S. Rose",
title = "Design Considerations for Multilevel {{CMOS\slash}
Nano} Memristive Memory",
journal = j-JETC,
volume = "8",
number = "1",
pages = "6:1--6:??",
month = feb,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2093145.2093151",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Tue Feb 28 16:37:42 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "With technology migration into nano and molecular
scales several hybrid CMOS/nano logic and memory
architectures have been proposed that aim to achieve
high device density with low power consumption. The
discovery of the memristor has further enabled the
realization of denser nanoscale logic and memory
systems by facilitating the implementation of
multilevel logic. This work describes the design of
such a multilevel nonvolatile memristor memory system,
and the design constraints imposed in the realization
of such a memory. In particular, the limitations on
load, bank size, number of bits achievable per device,
placed by the required noise margin for accurately
reading and writing the data stored in a device are
analyzed.",
acknowledgement = ack-nhfb,
articleno = "6",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Bhunia:2012:ISI,
author = "Swarup Bhunia and Darrin J. Young",
title = "Introduction to Special Issue on Implantable
Electronics",
journal = j-JETC,
volume = "8",
number = "2",
pages = "7:1--7:??",
month = jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2180878.2180879",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Sat Jun 23 12:02:51 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
acknowledgement = ack-nhfb,
articleno = "7",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Ko:2012:EHC,
author = "Wen H. Ko",
title = "Early History and Challenges of Implantable
Electronics",
journal = j-JETC,
volume = "8",
number = "2",
pages = "8:1--8:??",
month = jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2180878.2180880",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Sat Jun 23 12:02:51 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Implantable systems for biomedical research and
clinical care are now a flourishing field of activities
in academia as well as industrial institutions. The
broad field includes experimental explorations in
electronics, mechanical, chemical, and biological
components and systems, and the combination of all
these. Today virtually all implants involve both
electronic circuits and
micro-electro-mechanical-systems (MEMS). This article
offers a very brief glance back at the early history of
implant electronics in the period from the 1950s to the
1970s, by employing selected examples from the author's
research. This short review also discusses the
challenges of implantable electronics at present, and
suggests some potentially important trends in the
future research and development of implantable
microsystems. It is aimed as an introduction of
implantable/attached electronic systems to research
engineers that are interested in implantable systems as
a section of Biomedical Instrumentations.",
acknowledgement = ack-nhfb,
articleno = "8",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Salam:2012:ICL,
author = "Muhammad Tariqus Salam and Mohamad Sawan and Dang Khoa
Nguyen",
title = "Implantable Closed-Loop Epilepsy Prosthesis: Modeling,
Implementation and Validation",
journal = j-JETC,
volume = "8",
number = "2",
pages = "9:1--9:??",
month = jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2180878.2180881",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Sat Jun 23 12:02:51 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "In this article, we present an implantable closed-loop
epilepsy prosthesis, which is dedicated to
automatically detect seizure onsets based on
intracerebral electroencephalographic (icEEG)
recordings from intracranial electrode contacts and
provide an electrical stimulation feedback to the same
contacts in order to disrupt these seizures. A novel
epileptic seizure detector and a dedicated electrical
stimulator were assembled together with common
recording electrodes to complete the proposed
prosthesis. The seizure detector was implemented in
CMOS 0.18-$ \mu $m by incorporating a new seizure
detection algorithm that models time-amplitude and
-frequency relationship in icEEG. The detector was
validated offline on ten patients with refractory
epilepsy and showed excellent performance for early
detection of seizures. The electrical stimulator, used
for suppressing the developing seizure, is composed of
two biphasic channels and was assembled with embedded
FPGA in a miniature PCB. The stimulator efficiency was
evaluated on cadaveric animal brain tissue in an in
vitro morphologic electrical model. Spatial
characteristics of the voltage distribution in cortex
were assessed in an attempt to identify optimal
stimulation parameters required to affect the suspected
epileptic focus. The experimental results suggest that
lower frequency stimulation parameters cause
significant amount of shunting of current through the
cerebrospinal fluid; however higher frequency
stimulation parameters produce effective spatial
voltage distribution with lower stimulation charge.",
acknowledgement = ack-nhfb,
articleno = "9",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Sharad:2012:LPA,
author = "Mrigank Sharad and Sumeet K. Gupta and Shriram
Raghunathan and Pedro P. Irazoqui and Kaushik Roy",
title = "Low-Power Architecture for Epileptic Seizure Detection
Based on Reduced Complexity {DWT}",
journal = j-JETC,
volume = "8",
number = "2",
pages = "10:1--10:??",
month = jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2180878.2180882",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Sat Jun 23 12:02:51 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "In this article, we present a low-power,
user-programmable architecture for discrete wavelet
transform (DWT) based epileptic seizure detection
algorithm. A simplified, low-pass filter (LPF)-only-DWT
technique is employed in which energy contents of
different frequency bands are obtained by subtracting
quasi-averaged, consecutive LPF outputs. Training phase
is used to identify the range of critical DWT
coefficients that are in turn used to set
patient-specific system level parameters for minimizing
power consumption. The proposed optimizations allow the
design to work at significantly lower power in the
normal operation mode. The system has been tested on
neural data obtained from kainate-treated rats. The
design was implemented in TSMC-65nm technology and
consumes less than 550-nW power at 250-mV supply.",
acknowledgement = ack-nhfb,
articleno = "10",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Majerus:2012:WUL,
author = "Steve J. A. Majerus and Steven L. Garverick and
Michael A. Suster and Paul C. Fletter and Margot S.
Damaser",
title = "Wireless, Ultra-Low-Power Implantable Sensor for
Chronic Bladder Pressure Monitoring",
journal = j-JETC,
volume = "8",
number = "2",
pages = "11:1--11:??",
month = jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2180878.2180883",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Sat Jun 23 12:02:51 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "The wireless implantable/intracavity micromanometer
(WIMM) system was designed to fulfill the unmet need
for a chronic bladder pressure sensing device in
urological fields such as urodynamics for diagnosis and
neuromodulation for bladder control. Neuromodulation in
particular would benefit from a wireless bladder
pressure sensor which could provide real-time pressure
feedback to an implanted stimulator, resulting in
greater bladder capacity while using less power. The
WIMM uses custom integrated circuitry, a MEMS
transducer, and a wireless antenna to transmit pressure
telemetry at a rate of 10 Hz. Aggressive power
management techniques yield an average current draw of
$ 9 \mu $A from a 3.6-Volt micro-battery, which
minimizes the implant size. Automatic pressure offset
cancellation circuits maximize the sensing dynamic
range to account for drifting pressure offset due to
environmental factors, and a custom telemetry protocol
allows transmission with minimum overhead. Wireless
operation of the WIMM has demonstrated that the
external receiver can receive the telemetry packets,
and the low power consumption allows for at least 24
hours of operation with a 4-hour wireless recharge
session.",
acknowledgement = ack-nhfb,
articleno = "11",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Huang:2012:IRD,
author = "Yu-Jie Huang and Hsin-Hung Liao and Pen-Li Huang and
Tao Wang and Yao-Joe Yang and Yao-Hong Wang and
Shey-Shi Lu",
title = "An Implantable Release-on-Demand {CMOS} Drug Delivery
{SoC} Using Electrothermal Activation Technique",
journal = j-JETC,
volume = "8",
number = "2",
pages = "12:1--12:??",
month = jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2180878.2180884",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Sat Jun 23 12:02:51 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "An implantable system-on-a-chip (SoC) integrating
controller/actuation circuitry and 8 individually
addressable drug reservoirs is proposed for on-demand
drug delivery. It is implemented by standard 0.35-$ \mu $m
CMOS technology and post-IC processing. The post-IC
processing includes deposition of metallic membranes
(200{\AA} Pt/3000{\AA} Ti/200{\AA} Pt) to cap the drug
reservoirs, deep dry etching to carve drug reservoirs
in silicon as drug containers, and PDMS layer bonding
to enlarge the drug storage. Based on electrothermal
activation technique, drug releases can be precisely
controlled by wireless signals. The wireless
controller/actuation circuits including on-off keying
(OOK) receiver, microcontroller unit, clock generator,
power-on-reset circuit, and switch array are integrated
on the same chip, providing patients the ability of
remote drug activation and noninvasive therapy
modification. Implanted by minimally invasive surgery,
this SoC can be used for the precise drug dosing of
localized treatment, such as the cancer therapy, or the
immediate medication to some emergent diseases, such as
heart attack. In vitro experimental results show that
the reservoir content can be released successfully
through the rupture of the membrane which is appointed
by received wireless commands.",
acknowledgement = ack-nhfb,
articleno = "12",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Sun:2012:NMD,
author = "Zhenyu Sun and Xiang Chen and Yaojun Zhang and Hai Li
and Yiran Chen",
title = "Nonvolatile Memories as the Data Storage System for
Implantable {ECG} Recorder",
journal = j-JETC,
volume = "8",
number = "2",
pages = "13:1--13:??",
month = jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2180878.2180885",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Sat Jun 23 12:02:51 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "In this article, we propose a data storage system with
the emerging nonvolatile memory technologies used for
the implantable electrocardiography (ECG) recorder. The
proposed storage system can record the digitalized
real-time ECG waveforms continuously inside the
implantable device and export the stored data to
external reader periodically to obtain a long-term
backup. Spin transfer torque random access memory
(STT-RAM) and spintronic memristor are selected as the
storage elements for their nonvolatility, high density,
high reliability, low power consumption, good
scalability, and CMOS technology compatibility. The new
read and write schemes of STT-RAM and spintronic
memristors are presented and optimized to fit the
specific application scenario. The tradeoffs among data
accuracy, chip area, and read/write energy for the
different technologies are thoroughly analyzed and
compared. Our simulation results show the configuration
with a data sampling rate (e.g., 128 Hz) and a
quantization resolution (e.g., 12 bits) can record
18-hour real-time data within $ \approx 3.6$-mm$^2$
chip area when the data storage is built with
single-level cell (SLC) STT-RAMs. Daily energy
consumption is $ 5.46$ mJ. Utilizing the multilevel
cell (MLC) STT-RAMs or the spintronic memristors as the
storage elements can further reduce the chip area and
decrease energy dissipation.",
acknowledgement = ack-nhfb,
articleno = "13",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Mohanty:2012:SSN,
author = "Saraju P. Mohanty",
title = "Special section on new circuit and architecture-level
solutions for multidiscipline systems",
journal = j-JETC,
volume = "8",
number = "3",
pages = "14:1--14:??",
month = aug,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2287696.2287697",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Mon Aug 20 15:17:55 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
acknowledgement = ack-nhfb,
articleno = "14",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Srivastava:2012:CLV,
author = "Ashok Srivastava and Yao Xu and Yang Liu and Ashwani
K. Sharma and Clay Mayberry",
title = "{CMOS LC} voltage controlled oscillator design using
multiwalled and single-walled carbon nanotube wire
inductors",
journal = j-JETC,
volume = "8",
number = "3",
pages = "15:1--15:??",
month = aug,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2287696.2287698",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Mon Aug 20 15:17:55 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "We have utilized our Multiwalled Carbon NanoTube
(MWCNT) and Single-Walled Carbon NanoTube (SWCNT)
bundle interconnects model in a widely used $ \pi $
model to study the performances of MWCNT and SWCNT
bundle wire inductors and compared these with copper
(Cu) inductors. The calculation results show that the
Q-factors of Carbon NanoTube (CNT) wire (SWCNT bundle
and MWCNT) inductors are higher than that of the Cu
wire inductor. This is mainly due to much lower
resistance of CNT and negligible skin effect in carbon
nanotubes at higher frequencies. The application of CNT
wire inductor in LC VCO is also studied and the
Cadence/Spectre simulations show that VCOs with CNT
bundle wire inductors have significantly improved
performance such as the higher oscillation frequency
and lower phase noise due to their smaller resistances
and higher Q-factors. It is also noticed that CMOS LC
VCO using a SWCNT bundle wire inductor has better
performance when compared with the performance of LC
VCO using the MWCNT wire inductor due to its lower
resistance and higher Q-factor.",
acknowledgement = ack-nhfb,
articleno = "15",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Mahalingam:2012:DCS,
author = "Venkataraman Mahalingam and Nagarajan Ranganathan and
Ransford {Hyman, Jr.}",
title = "Dynamic clock stretching for variation compensation in
{VLSI} circuit design",
journal = j-JETC,
volume = "8",
number = "3",
pages = "16:1--16:??",
month = aug,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2287696.2287699",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Mon Aug 20 15:17:55 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "In the nanometer era, process, voltage, and
temperature variations are dominating circuit
performance, power, and yield. Over the past few years,
statistical optimization methods have been effective in
improving yield in the presence of uncertainty due to
process variations. However, statistical methods
overconsume resources, even in the absence of
variations. Hence, to facilitate a better
performance-power-yield trade-off, techniques that can
dynamically enable variation compensation are becoming
necessary. In this article, we propose a dynamic
technique that controls the instance of data capture in
critical path memory flops, by delaying the clock edge
trigger. The methodology employs a dynamic delay
detection circuit to identify the uncertainty in delay
due to variations and stretches the clock in the
destination flip-flops. The delay detection circuit
uses a latch and set of combinational gates to
dynamically detect and create the slack needed to
accommodate the delay due to variations. The Clock
Stretching Logic (CSL) is added only to paths, which
have a high probability of failure in the presence of
variations. The proposed methodology improves the
timing yield of the circuit without significant
overcompensation. The methodology approach was
simulated using Synopsys design tools for circuit
synthesis and Cadence tools for placement and routing
of the design. Extraction of parasitic of timing
information was parsed using Perl scripts and simulated
using a simulation program generated in C++.
Experimental results based on Monte-Carlo simulations
on benchmark circuits indicate considerable improvement
in timing yield with negligible area overhead.",
acknowledgement = ack-nhfb,
articleno = "16",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Roy:2012:CAL,
author = "Sudip Roy and Debasis Mitra and Bhargab B.
Bhattacharya and Krishnendu Chakrabarty",
title = "Congestion-aware layout design for high-throughput
digital microfluidic biochips",
journal = j-JETC,
volume = "8",
number = "3",
pages = "17:1--17:??",
month = aug,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2287696.2287700",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Mon Aug 20 15:17:55 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Potential applications of digital microfluidic (DMF)
biochips now include several areas of real-life
applications like environmental monitoring, water and
air pollutant detection, and food processing to name a
few. In order to achieve sufficiently high throughput
for these applications, several instances of the same
bioassay may be required to be executed concurrently on
different samples. As a straightforward implementation,
several identical biochips can be integrated on a
single substrate as a multichip to execute the assay
for various samples concurrently. Controlling
individual electrodes of such a chip by independent
pins may not be acceptable since it increases the cost
of fabrication. Thus, in order to keep the overall
pin-count within an acceptable bound, all the
respective electrodes of these individual pieces are
connected internally underneath the chip so that they
can be controlled with a single external control pin.
In this article, we present an orientation strategy for
layout of a multichip that reduces routing congestion
and consequently facilitates wire routing for the
electrode array. The electrode structure of the
individual pieces of the multichip may be either
direct-addressable or pin-constrained. The method also
supports a hierarchical approach to wire routing that
ensures scalability. In this scheme, the size of the
biochip in terms of the total number of electrodes may
be increased by a factor of four by increasing the
number of routing layers by only one. In general, for a
multichip with $ 4^n $ identical blocks, $ (n + 1) $ layers
are sufficient for wire routing.",
acknowledgement = ack-nhfb,
articleno = "17",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Komerath:2012:RBP,
author = "Narayanan Komerath and Aravinda Kar",
title = "Retail beamed power using millimeter waves: Survey",
journal = j-JETC,
volume = "8",
number = "3",
pages = "18:1--18:??",
month = aug,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2287696.2287701",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Mon Aug 20 15:17:55 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Retail delivery of electric power through millimeter
waves is relevant in developing areas where the market
for communication devices outpaces the power grid
infrastructure. It is also a critical component of an
evolutionary path towards terrestrial and space-based
renewable power generation. Narrow-band power can be
delivered as focused beams to receivers near end-users,
from central power plants, rural distribution points,
UAVs, tethered aerostats, stratospheric airship
platforms, or space satellites. The article surveys the
available knowledge base on millimeter wave beamed
power delivery. It then considers design requirements
for a retail beamed power architecture, in the context
of rural India where power delivery is lagging behind
the demand growth for connectivity. A survey of
technology developments relevant to millimeter wave
beaming is conducted, and indicates that massive,
mass-produced solid-state arrays capable of achieving
good efficiency and cost effectiveness are possible in
the near term to enable such retail power beaming
architectures.",
acknowledgement = ack-nhfb,
articleno = "18",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Palaniswamy:2012:EHI,
author = "Ashok Kumar Palaniswamy and Spyros Tragoudas",
title = "An efficient heuristic to identify threshold logic
functions",
journal = j-JETC,
volume = "8",
number = "3",
pages = "19:1--19:??",
month = aug,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2287696.2287702",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Mon Aug 20 15:17:55 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "A fast method to identify the given Boolean function
as a threshold function with weight assignment is
introduced. It characterizes the function based on the
parameters that have been defined in the literature.
The proposed method is capable to quickly characterize
all functions that have less than eight inputs and has
been shown to operate fast for functions with as many
as forty inputs. Furthermore, comparisons with other
existing heuristic methods show huge increase in the
number of threshold functions identified, and drastic
reduction in time and complexity.",
acknowledgement = ack-nhfb,
articleno = "19",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Xu:2012:EPV,
author = "Hu Xu and Vasilis F. Pavlidis and Giovanni {De
Micheli}",
title = "Effect of process variations in {$3$D} global clock
distribution networks",
journal = j-JETC,
volume = "8",
number = "3",
pages = "20:1--20:??",
month = aug,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2287696.2287703",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Mon Aug 20 15:17:55 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "In three-dimensional (3D) integrated circuits, the
effect of process variations on clock skew differs from
2D circuits. The combined effect of inter-die and
intra-die process variations on the design of 3D clock
distribution networks is considered in this article. A
statistical clock skew model incorporating both the
systematic and random components of process variations
is employed to describe this effect. Two regular 3D
clock tree topologies are investigated and compared in
terms of clock skew variation. The statistical skew
model used to describe clock skew variations is
verified through Monte-Carlo simulations. The clock
skew is shown to change in different ways with the
number of planes forming the 3D IC and the clock
network architecture. Simulations based on a 45-nm CMOS
technology show that the maximum standard deviation of
clock skew can vary from 15 ps to 77 ps. Results
indicate that simply increasing the number of planes of
a 3D IC does not necessarily lead to lower skew
variation and higher operating frequencies. A
multigroup 3D clock tree topology is proposed to
effectively mitigate the variability of clock skew.
Tradeoffs between the investigated 3D clock
distribution networks and the number of planes
comprising a 3D circuit are discussed and related
design guidelines are offered. The skew variation in 3D
clock trees is also compared with the skew variation of
clock grids.",
acknowledgement = ack-nhfb,
articleno = "20",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Kursun:2012:STT,
author = "Eren Kursun and Jamil Wakil and Mukta Farooq and
Robert Hannon",
title = "Spatial and temporal thermal characterization of
stacked multicore architectures",
journal = j-JETC,
volume = "8",
number = "3",
pages = "21:1--21:??",
month = aug,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2287696.2287704",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Mon Aug 20 15:17:55 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Three-dimensional integration provides a new way of
performance growth for microprocessor architectures.
While recent studies report promising performance
improvement numbers, majority of the processor stacking
options are thermally-limited. Elevated stack
temperatures have significant effect on the overall
energy efficiency and reliability of the processor;
they also limit the potential peak performance
improvement from the 3D implementation. Thermal
characteristics of 3D stacks differ from 2D processors
in various ways including: the nature of heat
dissipation throughout the stack, thermal conductivity
of the 3D structures such as micro-C4 layers, and
hotspot interactions among layers. The intensity of the
corresponding thermal problems is highly dependent on
the 3D technology, processor and stack parameters. In
this study we focus on spatial and temporal thermal
characteristics of 3D multicore architectures using
high-fidelity technology and processor models. Our
experimental results highlight the need for integrating
detailed thermal models in the design flow, starting
with the early design stages. In addition, the reduced
time constants and elevated on-chip temperatures
indicate faster response time requirements for dynamic
thermal management in processor stacking options.",
acknowledgement = ack-nhfb,
articleno = "21",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Liu:2012:RAP,
author = "Bao Liu and Xuemei Chen and Fiona Teshome",
title = "Resilient and adaptive performance logic",
journal = j-JETC,
volume = "8",
number = "3",
pages = "22:1--22:??",
month = aug,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2287696.2287705",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Mon Aug 20 15:17:55 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "As VLSI technology continues scaling, increasingly
significant parametric variations and increasingly
prevalent defects present unprecedented challenges to
VLSI design at nanometer scale. Specifically,
performance variability has hindered performance
scaling, while soft errors become an emerging problem
for logic computation at recent technology nodes. In
this article, we leverage the existing Totally
Self-Checking (TSC)/Strongly Fault-Secure (SFS) logic
design techniques, and propose Resilient and Adaptive
Performance (RAP) logic for maximum adaptive
performance and soft error resilience in nanoscale
computing. RAP logic clears all timing errors in the
absence of external soft errors, albeit at a higher
area/power cost compared with Razor logic. Our
experimental results further show that dual-rail static
(Domino) RAP logic outperforms alternative
Delay-Insensitive (DI) code-based static (Domino) RAP
logic with less area, higher performance, and lower
power consumption for the large test cases, and
achieves an average of 2.29(2.41)$ \times $ performance
boost, 2.12(1.91)$ \times $ layout area, and
2.38(2.34)$ \times $ power consumption compared with
the traditional minimum area static logic based on the
Nangate 45-nm open cell library.",
acknowledgement = ack-nhfb,
articleno = "22",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Chang:2012:PED,
author = "Kevin Chang and Sujay Deb and Amlan Ganguly and Xinmin
Yu and Suman Prasad Sah and Partha Pratim Pande and
Benjamin Belzer and Deukhyoun Heo",
title = "Performance evaluation and design trade-offs for
wireless network-on-chip architectures",
journal = j-JETC,
volume = "8",
number = "3",
pages = "23:1--23:??",
month = aug,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2287696.2287706",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Mon Aug 20 15:17:55 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Massive levels of integration are making modern
multicore chips all pervasive in several domains. High
performance, robustness, and energy-efficiency are
crucial for the widespread adoption of such platforms.
Networks-on-Chip (NoCs) have emerged as communication
backbones to enable a high degree of integration in
multicore Systems-on-Chip (SoCs). Despite their
advantages, an important performance limitation in
traditional NoCs arises from planar metal
interconnect-based multihop links with high latency and
power consumption. This limitation can be addressed by
drawing inspiration from the evolution of natural
complex networks, which offer great performance-cost
trade-offs. Analogous with many natural complex
systems, future multicore chips are expected to be
hierarchical and heterogeneous in nature as well. In
this article we undertake a detailed performance
evaluation for hierarchical small-world NoC
architectures where the long-range communications links
are established through the millimeter-wave wireless
communication channels. Through architecture-space
exploration in conjunction with novel power-efficient
on-chip wireless link design, we demonstrate that it is
possible to improve performance of conventional NoC
architectures significantly without incurring high area
overhead.",
acknowledgement = ack-nhfb,
articleno = "23",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Choi:2012:DQA,
author = "Byung-Soo Choi and Rodney {Van Meter}",
title = "A {$ \Theta (\sqrt n) $}-depth quantum adder on the
{$2$D NTC} quantum computer architecture",
journal = j-JETC,
volume = "8",
number = "3",
pages = "24:1--24:??",
month = aug,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2287696.2287707",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Mon Aug 20 15:17:55 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "In this work, we propose an adder for the
2-Dimensional Nearest-Neighbor, Two-Qubit gate,
Concurrent (2D NTC) architecture, designed to match the
architectural constraints of many quantum computing
technologies. The chosen architecture allows the layout
of logical qubits in two dimensions with $ \sqrt n $
columns where each column has $ \sqrt n $ qubits and
the concurrent execution of one- and two-qubit gates
with nearest-neighbor interaction only. The proposed
adder works in three phases. In the first phase, the
first column generates the summation output and the
other columns do the carry-lookahead operations. In the
second phase, these intermediate values are propagated
from column to column, preparing for computation of the
final carry for each register position. In the last
phase, each column, except the first one, generates the
summation output using this column-level carry. The
depth and the number of qubits of the proposed adder
are $ \Theta (\sqrt n) $ and $ O(n) $, respectively.
The proposed adder executes faster than the adders
designed for the 1D NTC architecture when the length of
the input registers $n$ is larger than 51.",
acknowledgement = ack-nhfb,
articleno = "24",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Huang:2012:PDT,
author = "Jiale Huang and Minhao Zhu and Shengqi Yang and Pallav
Gupta and Wei Zhang and Steven M. Rubin and Gilda
Garret{\'o}n and Jin He",
title = "A physical design tool for carbon nanotube
field-effect transistor circuits",
journal = j-JETC,
volume = "8",
number = "3",
pages = "25:1--25:??",
month = aug,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2287696.2287708",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Mon Aug 20 15:17:55 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "In this article, we present a graphical Computer-Aided
Design (CAD) environment for the design, analysis, and
layout of Carbon NanoTube (CNT) Field-Effect Transistor
(CNFET) circuits. This work is motivated by the fact
that such a tool currently does not exist in the public
domain for researchers. Our tool has been integrated
within Electric, a very powerful, yet free CAD system
for custom design of Integrated Circuits (ICs). The
tool supports CNFET schematic and layout entry, rule
checking, and HSpice/VerilogA netlist generation. We
provide users with a customizable CNFET technology
library with the ability to specify $ \lambda $-based
design rules. We showcase the capabilities of our tool
by demonstrating the design of a large CNFET standard
cell and components library. Meanwhile, HSPICE
simulations also have been presented for cell library
characterization. We hope that the availability of this
tool will invigorate the CAD community to explore novel
ideas in CNFET circuit design.",
acknowledgement = ack-nhfb,
articleno = "25",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Pande:2012:ISI,
author = "Partha Pratim Pande and Amlan Ganguly",
title = "Introduction to the special issue on sustainable and
green computing systems",
journal = j-JETC,
volume = "8",
number = "4",
pages = "26:1--26:??",
month = oct,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2367736.2367737",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Nov 28 17:25:59 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
acknowledgement = ack-nhfb,
articleno = "26",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Banerjee:2012:TNZ,
author = "Prithviraj Banerjee and Chandrakant Patel and Cullen
Bash and Amip Shah and Martin Arlitt",
title = "Towards a net-zero data center",
journal = j-JETC,
volume = "8",
number = "4",
pages = "27:1--27:??",
month = oct,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2367736.2367738",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Nov 28 17:25:59 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "A world consisting of billions of service-oriented
client devices and thousands of data centers can
deliver a diverse range of services, from social
networking to management of our natural resources.
However, these services must scale in order to meet the
fundamental needs of society. To enable such scaling,
the total cost of ownership of the data centers that
host the services and comprise the vast majority of
service delivery costs will need to be reduced. As
energy drives the total cost of ownership of data
centers, there is a need for a new paradigm in design
and management of data centers that minimizes energy
used across their lifetimes, from ``cradle to cradle''.
This tutorial article presents a blueprint for a
``net-zero data center'': one that offsets any
electricity used from the grid via adequate on-site
power generation that gets fed back to the grid at a
later time. We discuss how such a data center addresses
the total cost of ownership, illustrating that contrary
to the oft-held view of sustainability as ``paying more
to be green'', sustainable data centers---built on a
framework that focuses on integrating supply and demand
management from end-to-end---can concurrently lead to
lowest cost and lowest environmental impact.",
acknowledgement = ack-nhfb,
articleno = "27",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Garg:2012:TDL,
author = "Siddharth Garg and Diana Marculescu and Radu
Marculescu",
title = "Technology-driven limits on runtime power management
algorithms for multiprocessor systems-on-chip",
journal = j-JETC,
volume = "8",
number = "4",
pages = "28:1--28:??",
month = oct,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2367736.2367739",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Nov 28 17:25:59 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Runtime power management is a critical technique for
reducing the energy footprint of digital electronic
devices and enabling sustainable computing, since it
allows electronic devices to dynamically adapt their
power and energy consumption to meet performance
requirements. In this article, we consider the case of
MultiProcessor Systems-on-Chip (MPSoC) implemented
using multiple Voltage and Frequency Islands (VFIs)
relying on fine-grained Dynamic Voltage and Frequency
Scaling (DVFS) to reduce the system power dissipation.
In particular, we present a framework to theoretically
analyze the impact of three important technology-driven
constraints: (i) reliability-driven upper limits on the
maximum supply voltage; (ii) inductive noise-driven
constraints on the maximum rate of change of
voltage/frequency; and (iii) the impact of
manufacturing process variations on the performance of
DVFS control for multiple VFI MPSoCs. The proposed
analysis is general, in the sense that it is not bound
to a specific DVFS control algorithm, but instead
focuses on theoretically bounding the performance that
any DVFS controller can possibly achieve. Our
experimental results on real and synthetic benchmarks
show that in the presence of reliability- and
temperature-driven constraints on the maximum frequency
and maximum frequency increment, any DVFS control
algorithm will lose up to 87\% performance in terms of
the number of steps required to reach a reference
steady state. In addition, increasing process
variations can lead to up to 60\% of fabricated chips
being unable to meet the specified DVFS control
specifications, irrespective of the DVFS algorithm
used. Nonetheless, we note that although conventional
DVFS might become less effective with technology
scaling, it will continue to play an important role in
the context of emerging power management techniques,
for example, for massively parallel multiprocessor
systems where only a subset of cores can be turned on
at any given point of time due to total power
constraints.",
acknowledgement = ack-nhfb,
articleno = "28",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Ghidini:2012:EEM,
author = "Giacomo Ghidini and Sajal K. Das",
title = "Energy-efficient {Markov} chain-based duty cycling
schemes for greener wireless sensor networks",
journal = j-JETC,
volume = "8",
number = "4",
pages = "29:1--29:??",
month = oct,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2367736.2367740",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Nov 28 17:25:59 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "To extend the lifetime of a wireless sensor network,
sensor nodes usually duty cycle between dormant and
active states. Duty cycling schemes are often evaluated
in terms of connection delay, connection duration, and
duty cycle. In this article, we show with experiments
on Sun SPOT sensors that duty cycling time (energy)
efficiency, that is, the ratio of time (energy)
employed in ancillary operations when switching from
and into deep sleep mode, is an important performance
metric too. We propose a novel randomized duty cycling
scheme based on Markov chains with the goal of (i)
reducing the connection delay, while maintaining a
given time (energy) efficiency, or (ii) keeping a
constant connection delay, while increasing the time
(energy) efficiency. Analytical and experimental
results demonstrate that the Markov chain-based scheme
can improve the performance in terms of connection
delay without affecting the time efficiency, or vice
versa, as opposed to the trade-off observed in
traditional schemes. We extend the proposed duty
cycling scheme to a partially randomized scheme, where
wireless nodes can switch into active state beyond
their schedules when their neighbors are active to
anticipate message forwarding. The analytical and
experimental results confirm the relationship between
connection delay and time efficiency also for this
scheme.",
acknowledgement = ack-nhfb,
articleno = "29",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Sego:2012:IDC,
author = "Landon H. Sego and Andr{\'e}s M{\'a}rquez and Andrew
Rawson and Tahir Cader and Kevin Fox and William I.
{Gustafson, Jr.} and Christopher J. Mundy",
title = "Implementing the data center energy productivity
metric",
journal = j-JETC,
volume = "8",
number = "4",
pages = "30:1--30:??",
month = oct,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2367736.2367741",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Nov 28 17:25:59 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "As data centers proliferate in size and number, the
endeavor to improve their energy efficiency and
productivity is becoming increasingly important. We
discuss the properties of a number of the proposed
metrics of energy efficiency and productivity. In
particular, we focus on the Data Center Energy
Productivity (DCeP) metric, which is the ratio of
useful work produced by the data center to the energy
consumed performing that work. We describe our approach
for using DCeP as the principal outcome of a designed
experiment using a highly instrumented,
high-performance computing data center. We found that
DCeP was successful in clearly distinguishing different
operational states in the data center, thereby
validating its utility as a metric for identifying
configurations of hardware and software that would
improve (or even maximize) energy productivity. We also
discuss some of the challenges and benefits associated
with implementing the DCeP metric, and we examine the
efficacy of the metric in making comparisons within a
data center and among data centers.",
acknowledgement = ack-nhfb,
articleno = "30",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Anagnostopoulou:2012:BAM,
author = "Vlasia Anagnostopoulou and Susmit Biswas and Heba
Saadeldeen and Alan Savage and Ricardo Bianchini and
Tao Yang and Diana Franklin and Frederic T. Chong",
title = "Barely alive memory servers: Keeping data active in a
low-power state",
journal = j-JETC,
volume = "8",
number = "4",
pages = "31:1--31:??",
month = oct,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2367736.2367742",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Nov 28 17:25:59 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Current resource provisioning schemes in Internet
services leave servers less than 50\% utilized almost
all the time. At this level of utilization, the
servers' energy efficiency is substantially lower than
at peak utilization. A solution to this problem could
be dynamically consolidating workloads into fewer
servers and turning others off. However, services
typically resist doing so, because of high response
times during reactivation in handling traffic spikes.
Moreover, services often want the memory and/or storage
of all servers to be readily available at all times. In
this article, we propose a family of barely alive
active low-power server states that facilitates both
fast reactivation and access to memory while in a
low-power state. We compare these states to previously
proposed active and idle states. In particular, we
investigate the impact of load bursts in each
energy-saving scheme. We also evaluate the additional
benefits of memory access under low-power states with a
study of a search service using a cooperative
main-memory cache. Finally, we propose a system that
combines a barely-alive state with the off state. We
find that the barely alive states can reduce service
energy consumption by up to 38\%, compared to an
energy-oblivious system. We also find that these energy
savings are consistent across a large parameter
space.",
acknowledgement = ack-nhfb,
articleno = "31",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Sheikh:2012:EPA,
author = "Hafiz Fahad Sheikh and Hengxing Tan and Ishfaq Ahmad
and Sanjay Ranka and Phanisekhar Bv",
title = "Energy- and performance-aware scheduling of tasks on
parallel and distributed systems",
journal = j-JETC,
volume = "8",
number = "4",
pages = "32:1--32:??",
month = oct,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2367736.2367743",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Nov 28 17:25:59 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Enabled by high-speed networking in commercial,
scientific, and government settings, the realm of high
performance is burgeoning with greater amounts of
computational and storage resources. Large-scale
systems such as computational grids consume a
significant amount of energy due to their massive
sizes. The energy and cooling costs of such systems are
often comparable to the procurement costs over a year
period. In this survey, we will discuss allocation and
scheduling algorithms, systems, and software for
reducing power and energy dissipation of workflows on
the target platforms of single processors, multicore
processors, and distributed systems. Furthermore,
recent research achievements will be investigated that
deal with power and energy efficiency via different
power management techniques and application scheduling
algorithms. The article provides a comprehensive
presentation of the architectural, software, and
algorithmic issues for energy-aware scheduling of
workflows on single, multicore, and parallel
architectures. It also includes a systematic taxonomy
of the algorithms developed in the literature based on
the overall optimization goals and characteristics of
applications.",
acknowledgement = ack-nhfb,
articleno = "32",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Kant:2012:EDC,
  author          = {Krishna Kant and Muthukumar Murugan and David H. C.
                     Du},
  title           = {Enhancing data center sustainability through
                     energy-adaptive computing},
  journal         = j-JETC,
  volume          = {8},
  number          = {4},
  pages           = {33:1--33:??},
  month           = oct,
  year            = {2012},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2367736.2367744},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Wed Nov 28 17:25:59 MST 2012},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {The sustainability concerns of Information Technology
                     (IT) go well beyond energy-efficient computing and
                     require techniques for minimizing environmental impact
                     of IT infrastructure over its entire life-cycle.
                     Traditionally, IT infrastructure is overdesigned at all
                     levels from chips to entire data centers and ecosystem;
                     the paradigm explored in this article is to replace
                     overdesign with rightsizing coupled with smarter
                     control, henceforth referred to as Energy-Adaptive
                     Computing or EAC. The article lays out the challenges
                     of EAC in various environments in terms of the
                     adaptation of the workload and the infrastructure to
                     cope with energy and cooling deficiencies. The article
                     then focuses on implementing EAC in a data center
                     environment, and addresses the problem of simultaneous
                     energy demand and energy supply regulation at multiple
                     levels, from servers to the entire data center.
                     The proposed control scheme adapts the assignments of
                     tasks to servers in a way that can cope with the
                     varying energy limitations. The article also presents
                     some experimental results to show how the scheme can
                     continue to meet Quality of Service (QoS) requirements
                     of tasks under energy limitations.},
  acknowledgement = ack-nhfb,
  articleno       = {33},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Abbasi:2012:DGD,
  author          = {Zahra Abbasi and Tridib Mukherjee and Georgios
                     Varsamopoulos and Sandeep K. S. Gupta},
  title           = {{DAHM}: a green and dynamic {Web} application hosting
                     manager across geographically distributed data
                     centers},
  journal         = j-JETC,
  volume          = {8},
  number          = {4},
  pages           = {34:1--34:??},
  month           = oct,
  year            = {2012},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2367736.2367745},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Wed Nov 28 17:25:59 MST 2012},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Dynamic Application Hosting Management (DAHM) is
                     proposed for geographically distributed data centers,
                     which decides on the number of active servers and on
                     the workload share of each data center. DAHM achieves
                     cost-efficient application hosting by taking into
                     account: (i) the spatio-temporal variation of energy
                     cost, (ii) the data center computing and cooling energy
                     efficiency, (iii) the live migration cost, and (iv) any
                     SLA violations due to migration overhead or network
                     delay. DAHM is modeled as fixed-charge min-cost flow
                     and mixed integer programming for stateless and
                     stateful applications, respectively, and it is shown
                     NP-hard. We also develop heuristic algorithms and
                     prove, when applications are stateless and servers have
                     an identical power consumption model, that the
                     approximation ratio on the minimum total cost is
                     bounded by the number of data centers. Further, the
                     heuristics are evaluated in a simulation study using
                     realistic parameter data; compared to a
                     performance-oriented application assignment, that is,
                     hosting at the data center with the least delay, the
                     potential cost savings of DAHM reaches 33\%. The
                     savings come from reducing the total number of active
                     servers as well as leveraging the cost efficiency of
                     data centers. Through the simulation study, the article
                     further explores how relaxing the delay requirement for
                     a small fraction of users can increase the cost savings
                     of DAHM.},
  acknowledgement = ack-nhfb,
  articleno       = {34},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Srinivasan:2013:NAF,
  author          = {S. Srinivasan and V. Kamakoti and A. Bhattacharya},
  title           = {A Novel Algorithm for Fast Synthesis of {DNA} Probes
                     on Microarrays},
  journal         = j-JETC,
  volume          = {9},
  number          = {1},
  pages           = {1:1--1:??},
  month           = feb,
  year            = {2013},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2422094.2422095},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Wed Feb 20 16:42:57 MST 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {DNA microarrays are used extensively for biochemical
                     analysis that includes genomics and drug discovery.
                     This increased usage demands large microarrays, thus
                     complicating their computer aided design (CAD) and
                     manufacturing methodologies. One such time-consuming
                     design problem is to minimize the border length of
                     masks used during the manufacture of microarrays. From
                     the manufacturing point of view the border length of
                     masks is one of the crucial parameters determining the
                     reliability of the microarray. This article presents a
                     novel algorithm for synthesis (placement and embedding)
                     of microarrays, which consumes significantly less time
                     than the best algorithm reported in the literature,
                     while maintaining the quality (border length of masks)
                     of the result. The proposed technique uses only a part
                     of each probe to decide on the placement and the
                     remaining parts for deciding on the embedding sequence.
                     This is in contrast to the earlier methods that
                     considered the entire probe for both placement and
                     embedding. The second novelty of the proposed technique
                     is the preclassification (prior to placement and
                     embedding) of probes based on their prefixes. This
                     decreases the complexity of the problem of deciding the
                     next probe to be placed from that involving computation
                     of Hamming distance between all probes (as used in
                     earlier approaches) to the one involving searching of
                     nonempty cells on a constant size grid array. The
                     proposed algorithm is $ 43 \times $ faster than the
                     best reported in the literature for the case of
                     synthesizing a microarray with 250,000 probes and
                     further exhibits linear behavior in terms of
                     computation time for larger microarrays.},
  acknowledgement = ack-nhfb,
  articleno       = {1},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Maftei:2013:MBS,
  author          = {Elena Maftei and Paul Pop and Jan Madsen},
  title           = {Module-Based Synthesis of Digital Microfluidic
                     Biochips with Droplet-Aware Operation Execution},
  journal         = j-JETC,
  volume          = {9},
  number          = {1},
  pages           = {2:1--2:??},
  month           = feb,
  year            = {2013},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2422094.2422096},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Wed Feb 20 16:42:57 MST 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Microfluidic biochips represent an alternative to
                     conventional biochemical analyzers. A digital biochip
                     manipulates liquids not as continuous flow, but as
                     discrete droplets on a two-dimensional array of
                     electrodes. Several electrodes are dynamically grouped
                     to form a virtual device, on which operations are
                     executed by moving the droplets. So far, researchers
                     have ignored the locations of droplets inside devices,
                     considering that all the electrodes forming the device
                     are occupied throughout the operation execution. In
                     this article, we consider a droplet-aware execution of
                     microfluidic operations, which means that we know the
                     exact position of droplets inside the modules at each
                     time-step. We propose a Tabu Search-based metaheuristic
                     for the synthesis of digital biochips with
                     droplet-aware operation execution. Experimental results
                     show that our approach can significantly reduce the
                     application completion time, allowing us to use smaller
                     area biochips and thus reduce costs.},
  acknowledgement = ack-nhfb,
  articleno       = {2},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Peper:2013:BCF,
  author          = {Ferdinand Peper and Jia Lee and Josep Carmona and
                     Jordi Cortadella and Kenichi Morita},
  title           = {{Brownian} Circuits: Fundamentals},
  journal         = j-JETC,
  volume          = {9},
  number          = {1},
  pages           = {3:1--3:??},
  month           = feb,
  year            = {2013},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2422094.2422097},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Wed Feb 20 16:42:57 MST 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Random fluctuations will be a major factor interfering
                     with the operation of nanometer scale electronic
                     devices. This article presents circuit architectures
                     that can exploit such fluctuations, if signals have a
                     particle-like (discrete, token-based) character. We
                     define an abstract circuit primitive that, though
                     lacking functionality when used with fluctuation-free
                     signals, becomes universal when fluctuations are
                     allowed. Key to the power of a signal's fluctuations is
                     the ability to explore the state space of a circuit.
                     This ability is used to resolve deadlock situations,
                     which could otherwise only be averted by increased
                     design complexity. The results in this article suggest
                     that in the design of future computers, signal
                     fluctuations, rather than being an impediment to be
                     avoided at any cost, may be an important ingredient to
                     achieve efficient operation.},
  acknowledgement = ack-nhfb,
  articleno       = {3},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Ghavami:2013:DAR,
  author          = {Behnam Ghavami and Mohsen Raji and Hossein Pedram and
                     Mehdi B. Tahoori},
  title           = {Design and Analysis of a Robust Carbon Nanotube-Based
                     Asynchronous Primitive Circuit},
  journal         = j-JETC,
  volume          = {9},
  number          = {1},
  pages           = {4:1--4:??},
  month           = feb,
  year            = {2013},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2422094.2422098},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Wed Feb 20 16:42:57 MST 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Carbon Nanotube Field Effect Transistors (CNFETs) show
                     great promise as extensions to silicon CMOS. However,
                     CNFET-based circuits will face great fabrication
                     challenges that will translate into important parameter
                     variations and decreased reliability. Hence,
                     asynchronous logic, which is intrinsically more robust
                     to variability, seems an ideal and perhaps unavoidable
                     choice for digital circuits in CNFET technology. This
                     article presents the results on the design and analysis
                     of a CNFET-based implementation of an asynchronous
                     circuit primitive: the Muller C-element. Using a CNFET
                     SPICE model, we evaluate the robustness of CNFET-based
                     C-element in the presence of CNT fabrication-related
                     nonidealities. We investigate a quantitative evaluation
                     of how timing variability impacts the functionality of
                     a C-element and then, extract the necessary delay
                     constraints of the C-element circuit from the signal
                     transition graph specification. Considering the large
                     degrees of spatial correlation observed between the
                     CNFETs fabricated on directionally grown CNTs, a layout
                     technique is exploited to overcome the robustness
                     challenges of a CNFET-based C-element. Extensive Monte
                     Carlo simulations on the proposed technique have
                     demonstrated the effectiveness of the proposed
                     CNFET-based C-element by improving approximately 50X in
                     its robustness in expense of 65\% area, 47\% delay, and
                     56\% power consumption overheads. Experimental results
                     indicate that implementation of some CNFET-based Quasi
                     Delay Insensitive (QDI) benchmark circuits using the
                     proposed C-element results in significant robustness
                     improvement with negligible power and throughput
                     overheads. As a promising step toward CNFET-based
                     giga-scale integrated circuits, this article shows that
                     the asynchronous logic is an effective approach to
                     design robust integrated circuits in CNFET technology
                     with inherent extreme physical variations.},
  acknowledgement = ack-nhfb,
  articleno       = {4},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Chen:2013:SAR,
  author          = {Yung-Chih Chen and Soumya Eachempati and Chun-Yao Wang
                     and Suman Datta and Yuan Xie and Vijaykrishnan
                     Narayanan},
  title           = {A Synthesis Algorithm for Reconfigurable
                     Single-Electron Transistor Arrays},
  journal         = j-JETC,
  volume          = {9},
  number          = {1},
  pages           = {5:1--5:??},
  month           = feb,
  year            = {2013},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2422094.2422099},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Wed Feb 20 16:42:57 MST 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Reducing power consumption has become one of the
                     primary challenges in chip design, and therefore
                     significant efforts are being devoted to find holistic
                     solutions on power reduction from the device level up
                     to the system level. Among a plethora of low power
                     devices that are being explored, single-electron
                     transistors (SETs) at room temperature are particularly
                     attractive. Although prior work has proposed a binary
                     decision diagram-based reconfigurable logic
                     architecture using SETs, it lacks an automatic
                     synthesis algorithm for the architecture. Consequently,
                     in this work, we develop a product-term-based approach
                     that synthesizes a logic circuit by mapping all its
                     product terms into the SET architecture. The
                     experimental results show the effectiveness and
                     efficiency of the proposed approach on a set of MCNC
                     benchmarks.},
  acknowledgement = ack-nhfb,
  articleno       = {5},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Tang:2013:TCT,
  author          = {Aoxiang Tang and Niraj K. Jha},
  title           = {Thermal Characterization of Test Techniques for
                     {FinFET} and {$3$D} Integrated Circuits},
  journal         = j-JETC,
  volume          = {9},
  number          = {1},
  pages           = {6:1--6:??},
  month           = feb,
  year            = {2013},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2422094.2422100},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Wed Feb 20 16:42:57 MST 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Power consumption has become a very important
                     consideration during integrated circuit (IC) design and
                     test. During test, it can far exceed the values reached
                     during normal operation and, thus, lead to temperatures
                     above the allowed threshold. Without appropriate
                     temperature reduction, permanent damage may be caused
                     to the IC or invalid test results may be obtained.
                     FinFET is a double-gate field-effect transistor
                     (DG-FET) that was introduced commercially in 2012. Due
                     to the vertical nature of FinFETs and, hence, weaker
                     ability to dissipate heat, this problem is likely to
                     get worse for FinFET circuits. Another technology
                     rapidly gaining popularity is 3D IC integration.
                     Unfortunately, the compact nature of a multidie 3D IC
                     is likely to aggravate the temperature-during-test
                     problem even further. Hence, before temperature-aware
                     test methodologies can be developed, it is important to
                     thermally analyze both FinFET and 3D circuits under
                     test. In this article, we present a methodology for
                     thermal characterization of various test techniques,
                     such as scan and built-in self-test (BIST), for FinFET
                     and 3D ICs. FinFET thermal characterization makes use
                     of a FinFET standard cell library that is characterized
                     with the help of the University of Florida double-gate
                     (UFDG) SPICE model. Thermal profiles for circuits under
                     test are produced by ISAC2 from University of Colorado
                     for FinFET circuits and HotSpot from University of
                     Virginia for 3D ICs. Experimental results indicate that
                     high temperatures result under BIST and much less often
                     under scan, and that both power consumption and test
                     application time should be reduced to lower the
                     temperature of circuits under test, just reducing the
                     power consumption is not enough.},
  acknowledgement = ack-nhfb,
  articleno       = {6},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Wang:2013:HRD,
  author          = {Shuo Wang and Jianwei Dai and Lei Wang},
  title           = {Hybrid Redundancy for Defect Tolerance in Molecular
                     Crossbar Memory},
  journal         = j-JETC,
  volume          = {9},
  number          = {1},
  pages           = {7:1--7:??},
  month           = feb,
  year            = {2013},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2422094.2422101},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Wed Feb 20 16:42:57 MST 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Nano/molecular technologies have emerged as the
                     potential fabrics for building future integrated
                     systems. However, due to the imperfect fabrication
                     process, these extremely scaled devices are vulnerable
                     to a large number of defects and transient faults.
                     Memory systems, which are the primary application
                     targeted by these technologies, are particularly
                     exposed to this problem due to the ultra-high
                     integration density and elevated error sensitivity. In
                     this article, we propose a defect-tolerant technique,
                     referred to as hybrid redundancy allocation, for the
                     design of molecular crossbar memory systems. By using
                     soft redundancy (runtime exploitation of memory
                     spatial/temporal locality) in combination with hardware
                     redundancy (spare memory cells), the proposed technique
                     can achieve better error management at a low cost as
                     compared with conventional techniques. Simulation
                     results demonstrate the significant improvement in
                     defect tolerance, efficiency, and scalability of the
                     proposed technique.},
  acknowledgement = ack-nhfb,
  articleno       = {7},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Narayanan:2013:VNF,
  author          = {Pritish Narayanan and Michael Leuchtenburg and Jorge
                     Kina and Prachi Joshi and Pavan Panchapakeshan and Chi
                     On Chui and C. Andras Moritz},
  title           = {Variability in Nanoscale Fabrics: Bottom-up Integrated
                     Analysis and Mitigation},
  journal         = j-JETC,
  volume          = {9},
  number          = {1},
  pages           = {8:1--8:??},
  month           = feb,
  year            = {2013},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2422094.2422102},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Wed Feb 20 16:42:57 MST 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Emerging nanodevice-based architectures will be
                     impacted by parameter variation in conjunction with
                     high defect rates. Variations in key physical
                     parameters are caused by manufacturing imprecision as
                     well as fundamental atomic scale randomness. In this
                     article, the impact of parameter variation on nanoscale
                     computing fabrics is extensively studied through a
                     novel integrated methodology across device, circuit and
                     architectural levels. This integrated approach enables
                     to study in detail the impact of physical parameter
                     variation across all fabric layers. A final
                     contribution of the article includes novel techniques
                     to address this impact. The variability framework,
                     while generic, is explored extensively on the Nanoscale
                     Application Specific Integrated Circuits (NASICs)
                     nanowire fabric. For variation of $ \sigma = 10 $ in
                     key physical parameters, the on current is found to
                     vary by up to 3.5X. Circuit-level delay shows up to
                     118\% deviation from nominal. Monte Carlo simulations
                     using an architectural simulator found 67\%
                     nanoprocessor chips to operate below nominal
                     frequencies due to variation. New built-in variation
                     mitigation and fault-tolerance schemes, leveraging
                     redundancy, asymmetric delay paths and biased voting
                     schemes, were developed and evaluated to mitigate these
                     effects. They are shown to improve performance by up to
                     7.5X on a nanoscale processor design with variation,
                     and improve performance in designs relying on
                     redundancy for defect tolerance, without variation
                     assumed. Techniques show up to 3.8X improvement in
                     effective-yield performance products even at a high
                     12\% defect rate. The suite of techniques provides a
                     design space across key system-level metrics such as
                     performance, yield and area.},
  acknowledgement = ack-nhfb,
  articleno       = {8},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Liang:2013:EWB,
  author          = {Jiale Liang and Stanley Yeh and S. Simon Wong and
                     H.-S. Philip Wong},
  title           = {Effect of Wordline\slash Bitline Scaling on the
                     Performance, Energy Consumption, and Reliability of
                     Cross-Point Memory Array},
  journal         = j-JETC,
  volume          = {9},
  number          = {1},
  pages           = {9:1--9:??},
  month           = feb,
  year            = {2013},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2422094.2422103},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Wed Feb 20 16:42:57 MST 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {The impact of wordline/bitline metal wire scaling on
                     the write/read performance, energy consumption, speed,
                     and reliability of the cross-point memory array is
                     quantitatively studied for technology nodes down to
                     single-digit nm. The impending resistivity increase in
                     the Cu wires is found to cause significant decrease of
                     both write and read window margins at the regime when
                     electron surface scattering and grain boundary
                     scattering are substantial. At deeply-scaled device
                     dimensions, the wire energy dissipation and wire
                     latency become comparable to or even exceed the
                     intrinsic values of memory cells. The large current
                     density flowing through the wordlines/bitlines raises
                     additional reliability concerns for the cross-point
                     memory array. All these issues are exacerbated at
                     smaller memory resistance values and larger memory
                     array sizes. They thereby impose strict constraints on
                     the memory device design and preclude the realization
                     of large-scale cross-point memory array with minimum
                     feature sizes beyond the 10 nm node. A rethink in the
                     design methodology of cross-point memory to incorporate
                     and mitigate the scaling effects of wordline/bitline is
                     necessary. Possible solutions include the use of memory
                     wires with better conductivity and scalability, memory
                     arrays with smaller partition sizes, and memory
                     elements with larger resistance values and resistance
                     ratios.},
  acknowledgement = ack-nhfb,
  articleno       = {9},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Paul:2013:ISI,
  author          = {Bipul C. Paul and Arijit Raychowdhury},
  title           = {Introduction to the special issue on memory
                     technologies},
  journal         = j-JETC,
  volume          = {9},
  number          = {2},
  pages           = {10:1--10:??},
  month           = may,
  year            = {2013},
  CODEN           = {????},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Sat Jun 1 11:19:09 MDT 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  articleno       = {10},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Yang:2013:MDC,
  author          = {J. Joshua Yang and R. Stanley Williams},
  title           = {Memristive devices in computing system: Promises and
                     challenges},
  journal         = j-JETC,
  volume          = {9},
  number          = {2},
  pages           = {11:1--11:??},
  month           = may,
  year            = {2013},
  CODEN           = {????},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Sat Jun 1 11:19:09 MDT 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Memristive devices with a simple structure are not
                     only very small but also very versatile, which makes
                     them an ideal candidate used for the next generation
                     computing system in the post-Si era. The working
                     mechanism of the devices and a family of nanodevices
                     built based on this working mechanism are introduced
                     first followed by some proposed applications of these
                     novel devices. The promises and challenges of these
                     devices are then discussed, together with the
                     significant progresses made recently in dealing with
                     these challenges.},
  acknowledgement = ack-nhfb,
  articleno       = {11},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Jackson:2013:NES,
  author          = {Bryan L. Jackson and Bipin Rajendran and Gregory S.
                     Corrado and Matthew Breitwisch and Geoffrey W. Burr and
                     Roger Cheek and Kailash Gopalakrishnan and Simone Raoux
                     and Charles T. Rettner and Alvaro Padilla and Alex G.
                     Schrott and Rohit S. Shenoy and B{\"u}lent N. Kurdi and
                     Chung H. Lam and Dharmendra S. Modha},
  title           = {Nanoscale electronic synapses using phase change
                     devices},
  journal         = j-JETC,
  volume          = {9},
  number          = {2},
  pages           = {12:1--12:??},
  month           = may,
  year            = {2013},
  CODEN           = {????},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Sat Jun 1 11:19:09 MDT 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {The memory capacity, computational power,
                     communication bandwidth, energy consumption, and
                     physical size of the brain all tend to scale with the
                     number of synapses, which outnumber neurons by a factor
                     of 10,000. Although progress in cortical simulations
                     using modern digital computers has been rapid, the
                     essential disparity between the classical von Neumann
                     computer architecture and the computational fabric of
                     the nervous system makes large-scale simulations
                     expensive, power hungry, and time consuming. Over the
                     last three decades, CMOS-based neuromorphic
                     implementations of ``electronic cortex'' have emerged
                     as an energy efficient alternative for modeling
                     neuronal behavior. However, the key ingredient for
                     electronic implementation of any self-learning
                     system---programmable, plastic Hebbian synapses scalable
                     to biological densities---has remained elusive. We
                     demonstrate the viability of implementing such
                     electronic synapses using nanoscale phase change
                     devices. We introduce novel programming schemes for
                     modulation of device conductance to closely mimic the
                     phenomenon of Spike Timing Dependent Plasticity (STDP)
                     observed biologically, and verify through simulations
                     that such plastic phase change devices should support
                     simple correlative learning in networks of spiking
                     neurons. Our devices, when arranged in a crossbar array
                     architecture, could enable the development of
                     synaptronic systems that approach the density ($
                     \approx 10^{11} $ synapses per sq cm) and energy
                     efficiency (consuming $ \approx 1 $ pJ per synaptic
                     programming event) of the human brain.},
  acknowledgement = ack-nhfb,
  articleno       = {12},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Apalkov:2013:STT,
  author          = {Dmytro Apalkov and Alexey Khvalkovskiy and Steven
                     Watts and Vladimir Nikitin and Xueti Tang and Daniel
                     Lottis and Kiseok Moon and Xiao Luo and Eugene Chen and
                     Adrian Ong and Alexander Driskill-Smith and Mohamad
                     Krounbi},
  title           = {Spin-transfer torque magnetic random access memory
                     {(STT-MRAM)}},
  journal         = j-JETC,
  volume          = {9},
  number          = {2},
  pages           = {13:1--13:??},
  month           = may,
  year            = {2013},
  CODEN           = {????},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Sat Jun 1 11:19:09 MDT 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Spin-transfer torque magnetic random access memory
                     (STT-MRAM) is a novel, magnetic memory technology that
                     leverages the base platform established by an existing
                     100+nm node memory product called MRAM to enable a
                     scalable nonvolatile memory solution for advanced
                     process nodes. STT-MRAM features fast read and write
                     times, small cell sizes of 6F$^2$ and potentially even
                     smaller, and compatibility with existing DRAM and SRAM
                     architecture with relatively small associated cost
                     added. STT-MRAM is essentially a magnetic multilayer
                     resistive element cell that is fabricated as an
                     additional metal layer on top of conventional CMOS
                     access transistors. In this review we give an overview
                     of the existing STT-MRAM technologies currently in
                     research and development across the world, as well as
                     some specific discussion of results obtained at Grandis
                     and with our foundry partners. We will show that
                     in-plane STT-MRAM technology, particularly the DMTJ
                     design, is a mature technology that meets all
                     conventional requirements for an STT-MRAM cell to be a
                     nonvolatile solution matching DRAM and/or SRAM drive
                     circuitry. Exciting recent developments in
                     perpendicular STT-MRAM also indicate that this type of
                     STT-MRAM technology may reach maturity faster than
                     expected, allowing even smaller cell size and product
                     introduction at smaller nodes.},
  acknowledgement = ack-nhfb,
  articleno       = {13},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Mojumder:2013:DPS,
  author          = {Niladri N. Mojumder and Xuanyao Fong and Charles
                     Augustine and Sumeet K. Gupta and Sri Harsha Choday and
                     Kaushik Roy},
  title           = {Dual pillar spin-transfer torque {MRAMs} for low power
                     applications},
  journal         = j-JETC,
  volume          = {9},
  number          = {2},
  pages           = {14:1--14:??},
  month           = may,
  year            = {2013},
  CODEN           = {????},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Sat Jun 1 11:19:09 MDT 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Electron-spin based data storage for on-chip memories
                     has the potential for ultra-high density, low power
                     consumption, very high endurance, and reasonably low
                     read/write latency. In this article, we discuss the
                     design challenges associated with spin-transfer torque
                     (STT) MRAM in its state-of-the-art configuration. We
                     propose an alternative bit cell configuration and three
                     new genres of magnetic tunnel junction (MTJ) structures
                     to improve STT-MRAM bit cell stabilities, write
                     endurance, and reduce write energy consumption. The
                     proposed multi-port, multi-pillar MTJ structures offer
                     the unique possibility of electrical and spatial
                     isolation of memory read and write. In order to realize
                     ultralow power under process variations, we propose
                     device, bit-cell and architecture level design
                     techniques. Such design alternatives at multiple levels
                     of design abstraction have been found to achieve
                     substantially enhanced robustness, density, reliability
                     and low power as compared to their charge-based
                     counterparts for future embedded applications.},
  acknowledgement = ack-nhfb,
  articleno       = {14},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Chatterjee:2013:EAS,
  author          = {Subho Chatterjee and Sayeef Salahuddin and Satish
                     Kumar and Saibal Mukhopadhyay},
  title           = {Electrothermal analysis of spin-transfer-torque random
                     access memory arrays},
  journal         = j-JETC,
  volume          = {9},
  number          = {2},
  pages           = {15:1--15:??},
  month           = may,
  year            = {2013},
  CODEN           = {????},
  ISSN            = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L          = {1550-4832},
  bibdate         = {Sat Jun 1 11:19:09 MDT 2013},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract        = {Spin Transfer Torque RAM (STTRAM) is a promising
                     candidate for fast, scalable, high-density, nonvolatile
                     memory in nanometer technology. However, relatively
                     high write current density and small volume of the
                     memory device indicate the possibility of significant
                     self-heating in the STTRAM structure. This article
                     performs a critical analysis of the self-heating
                     induced temperature variations in STTRAM. We perform a
                     3D finite volume method based study to characterize
                     self-heating effect in a single cell. The analysis is
                     extended for STTRAM arrays by developing a
                     computationally efficient RC compact model based
                     thermal analyzer. The analysis shows that self-heating
                     can result in considerable increase in both
                     steady-state value and transient change in temperature
                     of individual cells. The effect is less pronounced at
                     the array level and depends on the activity level, that
                     is, number of active cells within an array size. The
                     analysis further illustrates that self-heating
                     negatively impacts electrical reliability metrics
                     namely, read margin and detection accuracy; degrades
                     cell performance; and modulates energy dissipation.},
  acknowledgement = ack-nhfb,
  articleno       = {15},
  fjournal        = {ACM Journal on Emerging Technologies in Computing
                     Systems (JETC)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Chen:2013:CCB,
  author = "Yiran Chen and Weng-Fai Wong and Hai Li and Cheng-Kok
    Koh and Yaojun Zhang and Wujie Wen",
  title = "On-chip caches built on multilevel spin-transfer
    torque {RAM} cells and its optimizations",
  journal = j-JETC,
  volume = "9",
  number = "2",
  pages = "16:1--16:??",
  month = may,
  year = "2013",
  CODEN = "????",
  ISSN = "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L = "1550-4832",
  bibdate = "Sat Jun 1 11:19:09 MDT 2013",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract = "It has been predicted that a processor's caches could
    occupy as much as 90\% of chip area a few technology
    nodes from the current ones. In this article, we
    investigate the use of multilevel spin-transfer torque
    RAM (STT-RAM) cells in the design of processor caches.
    We start with examining the access (read and write)
    scheme for multilevel cell (MLC) STT-RAM from a circuit
    design perspective, detailing the read and write
    circuits. Compared to traditional SRAM caches, a
    multilevel cell (MLC) STT-RAM cache design is denser,
    fast, and requires less energy. However, a number of
    critical architecture-level issues remain to be solved
    before MLC STT-RAM technology can be deployed in
    processor caches. We shall offer solutions to the issue
    of bit encoding as well as tackle the write endurance
    problem. In particular, the latter has been neglected
    in previous works on STT-RAM caches. We propose a set
    remapping scheme that can potentially prolong the
    lifetime of a MLC STT-RAM cache by 80$ \times $ on
    average. Furthermore, a method for recovering the
    performance that may be lost in some applications due
    to set remapping is proposed. The impacts of process
    variations of the MLC STT-RAM cell on the robustness of
    the memory hierarchy is also discussed, together with
    various enhancement techniques, namely, ECC and design
    redundancy.",
  acknowledgement = ack-nhfb,
  articleno = "16",
  fjournal = "ACM Journal on Emerging Technologies in Computing
    Systems (JETC)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Thapliyal:2013:DER,
  author = "Himanshu Thapliyal and Nagarajan Ranganathan",
  title = "Design of efficient reversible logic-based binary and
    {BCD} adder circuits",
  journal = j-JETC,
  volume = "9",
  number = "3",
  pages = "17:1--17:??",
  month = sep,
  year = "2013",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2491682",
  ISSN = "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L = "1550-4832",
  bibdate = "Tue Oct 1 18:20:25 MDT 2013",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract = "Reversible logic is gaining significance in the
    context of emerging technologies such as quantum
    computing since reversible circuits do not lose
    information during computation and there is one-to-one
    mapping between the inputs and outputs. In this work,
    we present a class of new designs for reversible binary
    and BCD adder circuits. The proposed designs are
    primarily optimized for the number of ancilla inputs
    and the number of garbage outputs and are designed for
    possible best values for the quantum cost and delay. In
    reversible circuits, in addition to the primary inputs,
    some constant input bits are used to realize different
    logic functions which are referred to as ancilla inputs
    and are overheads that need to be reduced. Further, the
    garbage outputs which do not contribute to any useful
    computations but are needed to maintain reversibility
    are also overheads that need to be reduced in
    reversible designs. First, we propose two new designs
    for the reversible ripple carry adder: (i) one with no
    input carry $ c_0 $ and no ancilla input bits, and (ii)
    one with input carry $ c_0 $ and no ancilla input bits.
    The proposed reversible ripple carry adder designs with
    no ancilla input bits have less quantum cost and logic
    depth (delay) compared to their existing counterparts
    in the literature. In these designs, the quantum cost
    and delay are reduced by deriving designs based on the
    reversible Peres gate and the TR gate. Next, four new
    designs for the reversible BCD adder are presented
    based on the following two approaches: (i) the addition
    is performed in binary mode and correction is applied
    to convert to BCD when required through detection and
    correction, and (ii) the addition is performed in
    binary mode and the result is always converted using a
    binary to BCD converter. The proposed reversible binary
    and BCD adders can be applied in a wide variety of
    digital signal processing applications and constitute
    important design components of reversible computing.",
  acknowledgement = ack-nhfb,
  articleno = "17",
  fjournal = "ACM Journal on Emerging Technologies in Computing
    Systems (JETC)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Lee:2013:CIP,
  author = "Woo Hyung Lee and Pinaki Mazumder",
  title = "Color image processing with multi-peak resonant
    tunneling diodes",
  journal = j-JETC,
  volume = "9",
  number = "3",
  pages = "18:1--18:??",
  month = sep,
  year = "2013",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2503128",
  ISSN = "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L = "1550-4832",
  bibdate = "Tue Oct 1 18:20:25 MDT 2013",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract = "The article introduces a novel approach to color image
    processing that utilizes multi-peak resonant tunneling
    diodes for encoding color information in quantized
    states of the diodes. The Multi-Peak Resonant Tunneling
    Diodes (MPRTDs) are organized as a two-dimensional
    array of vertical pillars which are locally connected
    by programmable passive and active elements with a view
    to realizing a wide variety of color image processing
    functions such as quantization, color extraction, image
    smoothing, edge detection, and line detection. In order
    to process color information in the input images, two
    different methods for color representation schemes have
    been used: one using color mapping and the other using
    direct RGB representation. Finally, the article uses
    HSPICE simulation methods for the netlist of the
    proposed RTD-based nanoarchitecture in order to verify
    a candidate of image functions by using the
    afore-mentioned representation methods.",
  acknowledgement = ack-nhfb,
  articleno = "18",
  fjournal = "ACM Journal on Emerging Technologies in Computing
    Systems (JETC)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Bobba:2013:CTP,
  author = "Shashikanth Bobba and Ashutosh Chakraborty and Olivier
    Thomas and Perrine Batude and Giovanni {De Micheli}",
  title = "Cell transformations and physical design techniques
    for {$3$D} monolithic integrated circuits",
  journal = j-JETC,
  volume = "9",
  number = "3",
  pages = "19:1--19:??",
  month = sep,
  year = "2013",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2491675",
  ISSN = "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L = "1550-4832",
  bibdate = "Tue Oct 1 18:20:25 MDT 2013",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract = "3D Monolithic Integration (3DMI), also termed as
    sequential integration, is a potential technology for
    future gigascale circuits. In 3DMI technology the 3D
    contacts, connecting different active layers, are in
    the order of few 100nm. Given the advantage of such
    small contacts, 3DMI enables fine-grain (gate-level)
    partitioning of circuits. In this work we present three
    cell transformation techniques for standard cell-based
    ICs with 3DMI technology. As a major contribution of
    this work, we propose a design flow comprising of a
    cell transformation technique, cell-on-cell stacking,
    and a physical design technique ({CELONCEL$_{PD}$})
    aimed at placing cells transformed with cell-on-cell
    stacking. We analyze and compare various cell
    transformation techniques for 3DMI technology without
    disrupting the regularity of the IC design flow. Our
    experiments demonstrate the effectiveness of CELONCEL
    design technique, yielding us an area reduction of
    37.5\%, 16.2\% average reduction in wirelength, and
    6.2\% average improvement in overall delay, compared
    with a 2D case when benchmarked across various designs
    in 45nm technology node.",
  acknowledgement = ack-nhfb,
  articleno = "19",
  fjournal = "ACM Journal on Emerging Technologies in Computing
    Systems (JETC)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Tang:2013:DSE,
  author = "Aoxiang Tang and Niraj K. Jha",
  title = "Design space exploration of {FinFET} cache",
  journal = j-JETC,
  volume = "9",
  number = "3",
  pages = "20:1--20:??",
  month = sep,
  year = "2013",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2491678",
  ISSN = "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L = "1550-4832",
  bibdate = "Tue Oct 1 18:20:25 MDT 2013",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract = "Integration of cache on-chip has significantly
    improved the performance of modern processors. The
    relentless demand for ever-increasing performance has
    led to the need to increase the cache capacity and
    number of cache levels. However, the performance
    improvement is accompanied by an increase in chip's
    power dissipation, requiring the use of more expensive
    cooling technologies to ensure chip reliability and
    long product life. The emergence of FinFETs as the
    technology of choice for high-performance computing
    poses new challenges to processor designers. With the
    introduction of new features in FinFETs, for example,
    independently controllable back gates, researchers have
    proposed several innovative memory cells that can
    reduce leakage power significantly, making the
    integration of a larger cache more practical. In this
    article, we comprehensively evaluate and compare the
    performance, power consumption (both dynamic and
    leakage), area, and temperature of different FinFET
    SRAM caches by exploring common configurations with
    varying cache size, block size, associativity, and
    number of banks. We evaluate caches based on four
    well-known FinFET SRAM cells: Pass-Gate FeedBack
    (PGFB), Row-based Back-Gate Biasing (RBGB), 8T, and 4T.
    We show how the caches can be simulated at
    self-consistent temperatures (at which leakage and
    temperature are in equilibrium). Drowsy and decay
    caches are two well-known leakage reduction techniques.
    We implement them in the context of FinFET caches to
    investigate their impact. We show that the RBGB
    cell-based cache is far superior in leakage and
    Power-Delay Product (PDP) to those based on the other
    three cells, sometimes by an order of magnitude. This
    superiority is maintained even when drowsy or decay
    leakage reduction techniques are applied to caches
    based on the other three cells, but not to the one
    based on the RBGB cell. This significantly diminishes
    the importance of drowsy or decay cache techniques, at
    least when the RBGB cell is used.",
  acknowledgement = ack-nhfb,
  articleno = "20",
  fjournal = "ACM Journal on Emerging Technologies in Computing
    Systems (JETC)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Zamani:2013:IFV,
  author = "Masoud Zamani and Hanieh Mirzaei and Mehdi B.
    Tahoori",
  title = "{ILP} formulations for variation\slash defect-tolerant
    logic mapping on crossbar nano-architectures",
  journal = j-JETC,
  volume = "9",
  number = "3",
  pages = "21:1--21:??",
  month = sep,
  year = "2013",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2491680",
  ISSN = "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L = "1550-4832",
  bibdate = "Tue Oct 1 18:20:25 MDT 2013",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract = "Several emerging nano-technologies, including crossbar
    nano-architectures, have recently been studied as
    possible replacement or supplement to CMOS technology
    in the future. However, extreme process variation and
    high failure rates, mainly due to atomic device sizes,
    are major challenges for crossbar nano-architectures.
    This article presents variation- and defect-tolerant
    logic mapping on crossbar nano-architectures. Since
    variation/defect-aware mapping is an NP-hard problem,
    we introduce a set of Integer Linear Programming (ILP)
    formulations to effectively solve the problem in a
    reasonable time. The proposed ILP formulations can be
    used for both diode-based and FET-based crossbars.
    Experimental results on benchmark circuits show that
    our approach can reduce the critical-path delay 39\%
    compared to the Simulated Annealing (SA) method. It can
    also successfully achieve 97\% defect-free mapping with
    40\% defect density. It can tolerate process variations
    to meet timing constraints in 95\% of the cases,
    compared to only 77\% achieved by SA.",
  acknowledgement = ack-nhfb,
  articleno = "21",
  fjournal = "ACM Journal on Emerging Technologies in Computing
    Systems (JETC)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Sun:2013:EVC,
  author = "Guangyu Sun and Eren Kursun and Jude A. Rivers and
    Yuan Xie",
  title = "Exploring the vulnerability of {CMPs} to soft errors
    with {$3$D} stacked nonvolatile memory",
  journal = j-JETC,
  volume = "9",
  number = "3",
  pages = "22:1--22:??",
  month = sep,
  year = "2013",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2491679",
  ISSN = "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L = "1550-4832",
  bibdate = "Tue Oct 1 18:20:25 MDT 2013",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract = "Improving the vulnerability to soft errors is one of
    the important design goals for future architecture
    design of Chip-MultiProcessors (CMPs). In this study,
    we explore the soft error characteristics of CMPs with
    3D stacked NonVolatile Memory (NVM), in particular, the
    Spin-Transfer Torque Random Access Memory (STT-RAM),
    whose cells are immune to radiation-induced soft errors
    and do not have endurance problems. We use 3D stacking
    as an enabler for modular integration of STT-RAM
    memories with minimum disruption in the baseline
    processor design flow, while providing further
    interconnection and capacity advantages. We take an
    in-depth look at alternative replacement schemes to
    explore the soft error resilience benefits and design
    trade-offs of 3D stacked STT-RAM and capture the
    multivariable optimization challenges microprocessor
    architectures face. We propose a vulnerability metric,
    with respect to the instruction and data in the core
    pipeline and through the cache hierarchy, to present a
    comprehensive system evaluation with respect to
    reliability, performance, and power consumption for our
    CMP architectures. Our experimental results show that,
    for the average workload, replacing memories with an
    STT-RAM alternative significantly mitigates soft errors
    on-chip, improves the performance by 14.15\%, and
    reduces power consumption by 13.44\%.",
  acknowledgement = ack-nhfb,
  articleno = "22",
  fjournal = "ACM Journal on Emerging Technologies in Computing
    Systems (JETC)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Yang:2013:NAC,
  author = "Shengqi Yang and Wenping Wang and Mark Hagan and Wei
    Zhang and Pallav Gupta and Yu Cao",
  title = "{NBTI}-aware circuit node criticality computation",
  journal = j-JETC,
  volume = "9",
  number = "3",
  pages = "23:1--23:??",
  month = sep,
  year = "2013",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2491681",
  ISSN = "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L = "1550-4832",
  bibdate = "Tue Oct 1 18:20:25 MDT 2013",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract = "For sub-65nm technology nodes, Negative Bias
    Temperature Instability (NBTI) has become a primary
    limiting factor of circuit lifetime. During the past
    few years, researchers have spent considerable effort
    on accurate modeling and characterization of circuit
    delay degradation caused by NBTI at different design
    levels. The search for techniques and methodologies
    which can aid in effectively minimizing the NBTI effect
    on circuit delay is still underway. In this work, we
    present the usage of node criticality computation to
    drive NBTI-aware timing analysis and optimization.
    Circuits that have undergone this optimization flow
    show strong resistance to NBTI delay degradation. For
    the first time, this work proposes a node criticality
    computation algorithm under an NBTI-aware timing
    analysis and optimization framework. Our work provides
    answers to the following yet unaddressed questions: (a)
    what is the definition of node criticality in a circuit
    under the NBTI effect? (b) how do we identify the
    critical nodes that, once protected, will be immune to
    NBTI timing degradation? and (c) what are the NBTI
    effect attenuation approaches? Experimental results
    indicate that by protecting the critical nodes found by
    our proposed methodology, circuit delay degradation can
    be reduced by up to 50\%. Combined with peak
    temperature reduction, the delay degradation can be
    further improved.",
  acknowledgement = ack-nhfb,
  articleno = "23",
  fjournal = "ACM Journal on Emerging Technologies in Computing
    Systems (JETC)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Wettin:2013:CNE,
  author = "Paul Wettin and Anuroop Vidapalapati and Amlan Ganguly
    and Partha Pratim Pande",
  title = "Complex network-enabled robust wireless
    network-on-chip architectures",
  journal = j-JETC,
  volume = "9",
  number = "3",
  pages = "24:1--24:??",
  month = sep,
  year = "2013",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2491676",
  ISSN = "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L = "1550-4832",
  bibdate = "Tue Oct 1 18:20:25 MDT 2013",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract = "The Network-on-Chip (NoC) paradigm has emerged as a
    scalable interconnection infrastructure for modern
    multicore chips. However, with growing levels of
    integration, the traditional NoCs suffer from high
    latency and energy dissipation in on-chip data transfer
    due to conventional multihop metal/dielectric-based
    interconnects. Three-dimensional integration, on-chip
    photonics, RF, and wireless links have been proposed as
    radical low-power and low-latency alternatives to the
    conventional planar wire-based designs. Wireless NoCs
    with Carbon NanoTube (CNT) antennas are shown to
    outperform traditional wire-based NoCs significantly in
    achievable data rate and energy dissipation. However,
    such emerging and transformative technologies will be
    prone to high levels of failures due to various issues
    related to manufacturing challenges and integration. On
    the other hand, several naturally occurring complex
    networks such as colonies of microbes and the World
    Wide Web are known to be inherently robust against high
    rates of failures and harsh environments. This article
    advocates adoption of such complex network-based
    architectures to minimize the effect of wireless link
    failures on the performance of the NoC. Through
    cycle-accurate simulations it is shown that the
    wireless NoC architectures inspired by natural complex
    networks perform better than their conventional wired
    counterparts even in the presence of high degrees of
    link failures. We demonstrate the robustness of the
    proposed wireless NoC architecture by incorporating
    both uniform and application-specific traffic
    patterns.",
  acknowledgement = ack-nhfb,
  articleno = "24",
  fjournal = "ACM Journal on Emerging Technologies in Computing
    Systems (JETC)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Zhang:2013:DTU,
  author = "Xuehui Zhang and Andrew Ferraiuolo and Mohammad
    Tehranipoor",
  title = "Detection of {Trojans} using a combined ring
    oscillator network and off-chip transient power
    analysis",
  journal = j-JETC,
  volume = "9",
  number = "3",
  pages = "25:1--25:??",
  month = sep,
  year = "2013",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2491677",
  ISSN = "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L = "1550-4832",
  bibdate = "Tue Oct 1 18:20:25 MDT 2013",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract = "Verifying the trustworthiness of Integrated Circuits
    (ICs) is of utmost importance, as hardware Trojans may
    destroy ICs bound for critical applications. A novel
    methodology combining on-chip structure with external
    current measurements is proposed to verify whether or
    not an IC is Trojan free. This method considers
    Trojans' impact on neighboring cells and on the entire
    IC's power consumption, and effectively localizes the
    measurement of dynamic power. To achieve this, we
    develop a new on-chip ring oscillator network structure
    distributed across the entire chip and place each ring
    oscillator's components in different rows of a
    standard-cell design. By developing novel statistical
    data analysis, the effect of process variations on the
    ICs' transient power will be separated from the effect
    of Trojans. Simulation results using 90nm technology
    and experimental results on Xilinx Spartan-6 FPGAs
    demonstrate the efficiency of our proposed method.",
  acknowledgement = ack-nhfb,
  articleno = "25",
  fjournal = "ACM Journal on Emerging Technologies in Computing
    Systems (JETC)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Guiducci:2013:ISI,
  author = "Carlotta Guiducci",
  title = "Introduction to Special Issue on Bioinformatics",
  journal = j-JETC,
  volume = "9",
  number = "4",
  pages = "26:1--26:??",
  month = nov,
  year = "2013",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2536744.2536745",
  ISSN = "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L = "1550-4832",
  bibdate = "Wed Nov 27 17:50:48 MST 2013",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  articleno = "26",
  fjournal = "ACM Journal on Emerging Technologies in Computing
    Systems (JETC)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Piovesan:2013:ERP,
  author = "Damiano Piovesan and Giuseppe Profiti and Pier Luigi
    Martelli and Piero Fariselli and Rita Casadio",
  title = "Extended and Robust Protein Sequence Annotation over
    Conservative Nonhierarchical Clusters: The Case Study
    of the {ABC} Transporters",
  journal = j-JETC,
  volume = "9",
  number = "4",
  pages = "27:1--27:??",
  month = nov,
  year = "2013",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2504729",
  ISSN = "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L = "1550-4832",
  bibdate = "Wed Nov 27 17:50:48 MST 2013",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract = "Genome annotation is one of the most important issues
    in the genomic era. The exponential growth rate of
    newly sequenced genomes and proteomes urges the
    development of fast and reliable annotation methods,
    suited to exploit all the information available in
    curated databases of protein sequences and structures.
    To this aim we developed BAR+, the Bologna Annotation
    Resource. The basic notion is that sequences with high
    identity value to a counterpart can inherit the same
    function/s and structure, if available. As a case study
    we describe how the ATP-binding domain of the ABC
    transporters can be found and modeled in over 30,000
    new sequences not annotated before. We also mapped into
    BAR+ all the ABC transporters listed in the Transporter
    Classification DataBase and found that within our
    environment annotation could be extended to another
    256,866 sequences.",
  acknowledgement = ack-nhfb,
  articleno = "27",
  fjournal = "ACM Journal on Emerging Technologies in Computing
    Systems (JETC)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Abate:2013:ILH,
  author = "Francesco Abate and Andrea Acquaviva and Elisa Ficarra
    and Enrico Macii",
  title = "Integration of Literature with Heterogeneous
    Information for Genes Correlation Scoring",
  journal = j-JETC,
  volume = "9",
  number = "4",
  pages = "28:1--28:??",
  month = nov,
  year = "2013",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2504728",
  ISSN = "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L = "1550-4832",
  bibdate = "Wed Nov 27 17:50:48 MST 2013",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract = "Determining the correlation between biomedical terms
    is a powerful instrument to help scientist research
    activity, both to understand experimental results and
    to design new ones. In particular, a great potential
    comes from the integration of the many heterogeneous
    information sources currently available on the Web. In
    this article we focus on the correlation between genes
    and biological processes. In this context, we present a
    methodology for integrating information from biomedical
    literature with other heterogeneous types of structured
    information. In particular, the information sources
    integrated in this work are PubMed abstracts, pathway
    databases, and NCI thesaurus definitions. The
    integration is performed at the semantic analysis level
    using a customized approach we developed to modulate
    the impact of the different sources on the correlation
    score. We report the results of a study concerning the
    impact of the information integration on the
    correlation score and of the user-level parameters we
    introduced to modulate the impact of pathway data or
    NCI definitions with respect to biomedical literature
    information, depending on the context of the search. To
    evaluate the methodology, we performed correlation
    measures on six biological processes and nine genes by
    comparing the results with and without the integration
    of pathways and NCI definitions.",
  acknowledgement = ack-nhfb,
  articleno = "28",
  fjournal = "ACM Journal on Emerging Technologies in Computing
    Systems (JETC)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Graziano:2013:HVB,
  author = "Mariagrazia Graziano and Stefano Frache and Maurizio
    Zamboni",
  title = "A Hardware Viewpoint on Biosequence Analysis: What's
    Next?",
  journal = j-JETC,
  volume = "9",
  number = "4",
  pages = "29:1--29:??",
  month = nov,
  year = "2013",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2504774",
  ISSN = "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L = "1550-4832",
  bibdate = "Wed Nov 27 17:50:48 MST 2013",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract = "Biosequence alignment recently received an increasing
    support from both commodity and dedicated hardware
    platforms. Processing capabilities are constantly
    rising, but still not satisfying the limitless
    requirements of this application. We give an insight on
    the contribution to this need that can possibly be
    expected from emerging technology devices and
    architectures, focusing as an example on nanofabrics
    based on silicon nanowires. By varying a few parameters
    we explore the solution space, and demonstrate with
    proper figures of merit how this family of beyond CMOS
    structures could be considered as the effective
    disruptive technology for biosequence analysis
    applications.",
  acknowledgement = ack-nhfb,
  articleno = "29",
  fjournal = "ACM Journal on Emerging Technologies in Computing
    Systems (JETC)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Venken:2013:SBM,
  author = "Lyn Venken and Kathleen Marchal and Jos Vanderleyden",
  title = "Synthetic Biology and Microdevices: a Powerful
    Combination",
  journal = j-JETC,
  volume = "9",
  number = "4",
  pages = "30:1--30:??",
  month = nov,
  year = "2013",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2504775",
  ISSN = "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L = "1550-4832",
  bibdate = "Wed Nov 27 17:50:48 MST 2013",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract = "Recent developments demonstrate that the combination
    of microbiology with micro- and nanoelectronics is a
    successful approach to develop new miniaturized sensing
    devices and other technologies. In the last decade,
    there has been a shift from the optimization of the
    abiotic components, for example, the chip, to the
    improvement of the processing capabilities of cells
    through genetic engineering. The synthetic biology
    approach will not only give rise to systems with new
    functionalities, but will also improve the robustness
    and speed of their response towards applied signals. To
    this end, the development of new genetic circuits has
    to be guided by computational design methods that
    enable to tune and optimize the circuit response. As
    the successful design of genetic circuits is highly
    dependent on the quality and reliability of its
    composing elements, intense characterization of
    standard biological parts will be crucial for an
    efficient rational design process in the development of
    new genetic circuits. Microengineered devices can
    thereby offer a new analytical approach for the study
    of complex biological parts and systems. By summarizing
    the recent techniques in creating new synthetic
    circuits and in integrating biology with microdevices,
    this review aims at emphasizing the power of combining
    synthetic biology with microfluidics and
    microelectronics.",
  acknowledgement = ack-nhfb,
  articleno = "30",
  fjournal = "ACM Journal on Emerging Technologies in Computing
    Systems (JETC)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Editors:2014:ISI,
  author = "Editors",
  title = "Introduction to special issue on reliability and
    device degradation in emerging technologies",
  journal = j-JETC,
  volume = "10",
  number = "1",
  pages = "1:1--1:??",
  month = jan,
  year = "2014",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2543749.2543750",
  ISSN = "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L = "1550-4832",
  bibdate = "Tue Jan 14 19:15:04 MST 2014",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  articleno = "1",
  fjournal = "ACM Journal on Emerging Technologies in Computing
    Systems (JETC)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Kufluoglu:2014:RMN,
  author = "Haldun K{\"u}fl{\"u}oglu and Cathy Chancellor and Min
    Chen and Claude Cirba and Vijay Reddy",
  title = "Recovery modeling of negative bias temperature
    instability {(NBTI)} for {SPICE}-compatible circuit
    aging simulators",
  journal = j-JETC,
  volume = "10",
  number = "1",
  pages = "2:1--2:??",
  month = jan,
  year = "2014",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2517648",
  ISSN = "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L = "1550-4832",
  bibdate = "Tue Jan 14 19:15:04 MST 2014",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract = "A feasible computational framework that enables
    improved predictability of NBTI degradation within
    commercially available tools is discussed. The NBTI
    model is used for real-time circuit operation where
    recovery is present. The complementary nature of
    implementation is readily incorporated into existing
    model extraction and verification tools. The method
    provides significantly enhanced accuracy in simulations
    when compared to circuit data, yet retains practicality
    and flexibility.",
  acknowledgement = ack-nhfb,
  articleno = "2",
  fjournal = "ACM Journal on Emerging Technologies in Computing
    Systems (JETC)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Arasu:2014:RIL,
  author = "Senthil Arasu and Mehrdad Nourani and Vijay Reddy and
    John M. {Carulli Jr.} and Gautam Kapila and Min Chen",
  title = "Reliability improvement of logic and clock paths in
    power-efficient designs",
  journal = j-JETC,
  volume = "10",
  number = "1",
  pages = "3:1--3:??",
  month = jan,
  year = "2014",
  CODEN = "????",
  DOI = "https://doi.org/10.1145/2543749.2543751",
  ISSN = "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L = "1550-4832",
  bibdate = "Tue Jan 14 19:15:04 MST 2014",
  bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract = "Performance degradation due to transistor aging is a
    significant impediment to high-performance IC design
    due to increasing concerns of reliability mechanisms
    such as negative-bias-temperature-instability (NBTI).
    The concern only grows with technology scaling as the
    effects of positive bias temperature instability (PBTI)
    are becoming prominent in future technologies and
    compounding with the effects of NBTI. Although aging of
    transistor is inevitable and the magnitude of
    degradation due to aging varies depending upon the
    context. Specifically, in power-efficient systems
    designs, the logic and clock paths are susceptible to
    static stress resulting in peak degradation due to BTI
    occurrence when clock is gated. In this article, we
    present the reliability impact of making systems power
    efficient and propose a design-for-reliability
    methodology that can be used in conjunction with
    low-power design techniques to alleviate the stress
    conditions caused by rendering circuits in idle state.
    The technique, BTI-Refresh, is shown to be applicable
    to both logic and clock paths alike and focuses on
    preventing prolonged static stress using periodic
    refreshes to achieve alternating stress. The mechanism
    is shown to integrate seamlessly into the design at
    gate-level without requiring any architectural or
    RT-level changes. Using ISCAS benchmarks and
    Kogge-Stone-Adder circuits, it is shown to reduce the
    aging effect in logic path delay due to static stress
    by up to 50\% with negligible area and power overhead.
    BTI-Refresh is extended to clock-paths to prevent
    pulse-width degradation due to static aging and with
    minimal clock-skew.",
  acknowledgement = ack-nhfb,
  articleno = "3",
  fjournal = "ACM Journal on Emerging Technologies in Computing
    Systems (JETC)",
  journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Sun:2014:WAC,
  author =       "Jin Sun and Roman Lysecky and Karthik Shankar and
                 Avinash Kodi and Ahmed Louri and Janet Roveda",
  title =        "Workload assignment considering {NBTI} degradation in
                 multicore systems",
  journal =      j-JETC,
  volume =       "10",
  number =       "1",
  pages =        "4:1--4:??",
  month =        jan,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2539124",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jan 14 19:15:04 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "With continuously shrinking technology, reliability
                 issues such as Negative Bias Temperature Instability
                 (NBTI) has resulted in considerable degradation of
                 device performance, and eventually the short
                 mean-time-to-failure (MTTF) of the whole multicore
                 system. This article proposes a new workload balancing
                 scheme based on device-level fractional NBTI model to
                 balance the workload among active cores while relaxing
                 stressed ones. Starting with NBTI-induced threshold
                 voltage degradation, we define a concept of Capacity
                 Rate (CR) as an indication of one core's ability to
                 accept workload. Capacity rate captures core's
                 performance variability in terms of delay and power
                 metrics under the impact of NBTI aging. The proposed
                 workload balancing framework employs the capacity rates
                 as workload constraints, applies a Dynamic Zoning (DZ)
                 algorithm to group cores into zones to process task
                 flows, and then uses Dynamic Task Scheduling (DTS) to
                 allocate tasks in each zone with balanced workload and
                 minimum communication cost. Experimental results on a
                 64-core system show that by allowing a small part of
                 the cores to relax over a short time period, the
                 proposed methodology improves multicore system yield
                 (percentage of core failures) by 20\%, while extending
                 MTTF by 30\% with insignificant degradation in
                 performance (less than 3\%).",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Chabi:2014:RLA,
  author =       "Djaafar Chabi and Damien Querlioz and Weisheng Zhao
                 and Jacques-Olivier Klein",
  title =        "Robust learning approach for neuro-inspired nanoscale
                 crossbar architecture",
  journal =      j-JETC,
  volume =       "10",
  number =       "1",
  pages =        "5:1--5:??",
  month =        jan,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2539123",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jan 14 19:15:04 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Scaling beyond CMOS require a new combination of
                 computing paradigm and new devices. In this context,
                 memristor are often considered as best candidate to
                 implement efficiently synapses in hardware neural
                 networks. In this article, we analyze the impact of
                 memristor parameter variability. We build an analytical
                 model of the global reliability at the crossbar level.
                 It is based on a supervised learning method with
                 multilayer and redundancy extensions. Comparisons with
                 Monte Carlo simulations of small neural network
                 validate our analytical model. It can be used to
                 extrapolate directly the reliability of large-scale
                 neural system. Our extrapolations show that high defect
                 rate and important parameter variability can be handle
                 efficiency with a moderate amount of redundancy.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Frache:2014:NAM,
  author =       "Stefano Frache and Mariagrazia Graziano and Maurizio
                 Zamboni",
  title =        "Nanoarray architectures multilevel simulation",
  journal =      j-JETC,
  volume =       "10",
  number =       "1",
  pages =        "6:1--6:??",
  month =        jan,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541882",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jan 14 19:15:04 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Density and regularity are deemed as the major
                 advantages of nanoarray architectures based on
                 nanowires. Literature demonstrated that proper
                 reliability analyzes must be performed and solutions
                 have to be devised to improve nanoarrays yield. Their
                 complexity and high-fault probability claim for
                 specific design automation tools able to explore
                 circuit solutions, performance and fault-tolerant
                 approaches. We envision a simulator conceived to carry
                 on characterizations in terms of logic behavior,
                 defect-induced output error rate assessment, switching
                 activity, power and timing performance. Though already
                 existing for traditional technology, a simulator based
                 on specific technological and topological tiled
                 nanoarray descriptions, and conceived to join both
                 device and architecture levels, has never been
                 attempted at the degree of accuracy we present. Our
                 contribution is twofold. First, marking a difference
                 with respect to the state of the art, we developed an
                 algorithm based on an event-driven engine which works
                 at switch level and is not simply built on top of cost
                 functions evaluations. The straightforward advantage is
                 the possibility to follow the evolution of dynamic
                 control sequences throughout all the inner components
                 of the nanoarray, and, as a consequence, to obtain
                 circuit level characterization as a projection of the
                 real internal parameters. Second, we added to our
                 simulator the capability to inject faults with specific
                 statistical distributions associated to the nanoarray
                 topology. Here we extract output error rates and yield
                 for one of the possible nanoarray structures proposed
                 in literature, the NASIC. Results specificity and
                 accuracy demonstrate the simulator trustworthiness, its
                 effectiveness for extensive nanoarrays characterization
                 and its suitability as a foundation for both higher
                 architectural and lower device simulation levels. The
                 aim of this work, then, is to provide insights into the
                 intertwined relation between actual technology and
                 circuit design for these emerging fabrics, and, as a
                 consequence, to clarify how defects and variability
                 affect circuits and systems performance.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Avritzer:2014:ISI,
  author =       "Alberto Avritzer and Tadashi Dohi",
  title =        "Introduction to special issue on {WoSAR 2011}",
  journal =      j-JETC,
  volume =       "10",
  number =       "1",
  pages =        "7:1--7:??",
  month =        jan,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2543749.2543752",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jan 14 19:15:04 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Cotroneo:2014:SSA,
  author =       "Domenico Cotroneo and Roberto Natella and Roberto
                 Pietrantuono and Stefano Russo",
  title =        "A survey of software aging and rejuvenation studies",
  journal =      j-JETC,
  volume =       "10",
  number =       "1",
  pages =        "8:1--8:??",
  month =        jan,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2539117",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jan 14 19:15:04 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Software aging is a phenomenon plaguing many
                 long-running complex software systems, which exhibit
                 performance degradation or an increasing failure rate.
                 Several strategies based on the proactive rejuvenation
                 of the software state have been proposed to counteract
                 software aging and prevent failures. This survey
                 article provides an overview of studies on Software
                 Aging and Rejuvenation (SAR) that have appeared in
                 major journals and conference proceedings, with respect
                 to the statistical approaches that have been used to
                 forecast software aging phenomena and to plan
                 rejuvenation, the kind of systems and aging effects
                 that have been studied, and the techniques that have
                 been proposed to rejuvenate complex software systems.
                 The analysis is useful to identify key results from SAR
                 research, and it is leveraged in this article to
                 highlight trends and open issues.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Zhao:2014:SRS,
  author =       "Jing Zhao and Yuliang Jin and Kishor S. Trivedi and
                 Rivalino {Matias Jr.} and Yanbin Wang",
  title =        "Software rejuvenation scheduling using accelerated
                 life testing",
  journal =      j-JETC,
  volume =       "10",
  number =       "1",
  pages =        "9:1--9:??",
  month =        jan,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2539118",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jan 14 19:15:04 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "A number of studies have reported the phenomenon of
                 ``Software aging'', caused by resource exhaustion and
                 characterized by progressive software performance
                 degradation. In this article, we carry out an
                 experimental study of software aging and rejuvenation
                 for an on-line bookstore application, following the
                 standard configuration of TPC-W benchmark. While real
                 website is used for the bookstore, the clients are
                 emulated. In order to reduce the time to application
                 failures caused by memory leaks, we use the accelerated
                 life testing (ALT) approach. We then select the Weibull
                 time to failure distribution at normal level, to be
                 used in a semi-Markov process, to compute the optimal
                 software rejuvenation trigger interval. Since the
                 validation of optimal rejuvenation trigger interval
                 with emulated browsers will take an inordinate long
                 time, we develop a simulation model to validate the ALT
                 experimental results, and also estimate the
                 steady-state availability to cross-validate the results
                 of the semi-Markov availability model.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Machida:2014:JCT,
  author =       "Fumio Machida and Victor F. Nicola and Kishor S.
                 Trivedi",
  title =        "Job completion time on a virtualized server with
                 software rejuvenation",
  journal =      j-JETC,
  volume =       "10",
  number =       "1",
  pages =        "10:1--10:??",
  month =        jan,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2539121",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jan 14 19:15:04 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "This article analyzes the completion time of a job
                 running on a virtualized server subject to software
                 aging and rejuvenation in a virtual machine monitor
                 (VMM). A job running on the server may be interrupted
                 by virtual machine (VM) failure, VMM failure or VMM
                 rejuvenation. The job interruption is categorized as
                 either preemptive-repeat (prt), in which case the
                 interrupted job needs to restart from the beginning, or
                 preemptive-resume (prs), in which case the job
                 resumes execution from the point of interruption. Using
                 a semi-Markov process (SMP) to model the server
                 behavior, the steady-state server availability is
                 computed and the theory developed in Kulkarni et al.
                 [1987] is used to obtain the Laplace--Stieltjes
                 transform (LST) of the job completion time. In the
                 numerical experiments, we introduce four types of aging
                 behavior of VMM. The effectiveness of VMM rejuvenation
                 on job completion time is discussed in association with
                 the type of interruption it causes and the VMM aging
                 type. With our parameter settings, VMM rejuvenation
                 with prs job interruption improves the performance of
                 job execution regardless of the aging type, with
                 performance degradation is taken into account.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Araujo:2014:SAE,
  author =       "Jean Araujo and Rubens Matos and Vandi Alves and Paulo
                 Maciel and F. Vieira de Souza and Rivalino {Matias Jr.}
                 and Kishor S. Trivedi",
  title =        "Software aging in the {Eucalyptus} cloud computing
                 infrastructure: Characterization and rejuvenation",
  journal =      j-JETC,
  volume =       "10",
  number =       "1",
  pages =        "11:1--11:??",
  month =        jan,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2539122",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jan 14 19:15:04 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "The need for high reliability, availability and
                 performance has significantly increased in modern
                 applications, that handle rapidly growing demands while
                 providing uninterruptible services. Cloud computing
                 systems fundamentally provide access to large pools of
                 data and computational resources. Eucalyptus is a
                 software framework largely used to implement private
                 clouds and hybrid-style Infrastructure as a Service. It
                 implements the Amazon Web Service (AWS) API, allowing
                 interoperability with other AWS-based services. This
                 article investigates the software aging effects in the
                 Eucalyptus framework, considering workloads composed of
                 intensive requests for remote storage attachment and
                 virtual machine instantiations. We found problems that
                 may be harmful to system dependability and performance,
                 specifically regarding to RAM memory and swap space
                 exhaustion, besides highly excessive CPU utilization by
                 the virtual machines. We also present an approach that
                 applies time series analysis to schedule rejuvenation,
                 so as to reduce the downtime by predicting the proper
                 moment to perform the rejuvenation. We experimentally
                 evaluate our approach using an Eucalyptus test bed. The
                 results show that our approach achieves higher
                 availability, when compared to a threshold-triggered
                 rejuvenation method based on continuous monitoring of
                 resources utilization.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Chen:2014:CRP,
  author =       "Jifeng Chen and Shuo Wang and Mohammad Tehranipoor",
  title =        "Critical-reliability path identification and delay
                 analysis",
  journal =      j-JETC,
  volume =       "10",
  number =       "2",
  pages =        "12:1--12:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2564926",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Fri Feb 28 17:06:25 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Circuit reliability analysis at the presilicon stage
                 has become vital for sub-45nm technology designs in
                 particular, due to aging effects, such as Negative Bias
                 Temperature Instability (NBTI) and Hot Carrier
                 Injection (HCI). To avoid potential reliability hazards
                 in the postsilicon stage, current large-scale designs
                 for commercial implementation overpessimistically
                 analyze circuit aging under assumed worst-case workload
                 in order not to violate the corner cases even for low
                 possibilities, thus introducing unnecessary margin in
                 the design timing analysis. The major issue is lack of
                 an effective aging analysis method applicable to large
                 designs with low CPU runtime, which is mainly due to:
                 (1) conventional reliability tools are extremely
                 time-consuming for circuit-level timing analysis and
                 thus are not practical for large designs; (2)
                 mathematical models developed to expedite the process
                 are not accurate due to the high complexity of aging
                 effects. In this article, a comprehensive analysis is
                 presented to highlight the importance of each aging
                 parameter. Then, a novel methodology is developed based
                 on current commercial reliability tools to guarantee
                 its high accuracy on circuit-level aging analysis.
                 Existing proven low-level mathematical models are
                 further enhanced to extensively speed up a higher level
                 analysis by taking advantage of the explicit
                 intermediate conditions stored in a pregenerated lookup
                 table. Our results indicate $ \geq 244 \times $
                 improved computational efficiency, $ \leq 5 \% $
                 relative error, and $ \leq 0.7 \% $ absolute error
                 compared with commercial reliability analysis tools
                 (e.g., HSPICE MOSRA).",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Gladshtein:2014:DBP,
  author =       "Michael Gladshtein",
  title =        "Delay-based processing-in-wire for design of {QCA}
                 serial decimal arithmetic units",
  journal =      j-JETC,
  volume =       "10",
  number =       "2",
  pages =        "13:1--13:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2564927",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Fri Feb 28 17:06:25 MST 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Quantum-dot cellular automata (QCA) technology is now
                 considered to be one of the prospective technologies
                 for a nanocomputer creation. The physical properties of
                 QCA and its expanding range of computer applications
                 make it expedient to use the novel paradigm of
                 nanocomputer architecture: serial decimal
                 storage-transfer-processing. The delay-based encoding
                 of decimal digits allows the use a delay element as a
                 main element of QCA serial arithmetic units. The simple
                 implementation of the delay element by a short length
                 of QCA wire results in reduction of complexity and of
                 the area required for a QCA circuit. The theoretical
                 basics of delay-based processing-in-wire and design
                 examples of QCA serial decimal arithmetic units are
                 presented.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Lin:2014:RRM,
  author =       "Chia-Chun Lin and Niraj K. Jha",
  title =        "{RMDDS}: {Reed--Muller} decision diagram synthesis of
                 reversible logic circuits",
  journal =      j-JETC,
  volume =       "10",
  number =       "2",
  pages =        "14:1--14:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2564923",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Fri Feb 28 17:06:25 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "In this article, we propose a flexible and efficient
                 reversible logic synthesizer. It exploits the
                 complementary advantages of two methods: Reed--Muller
                 Reversible Logic Synthesis (RMRLS) and Decision Diagram
                 Synthesis (DDS), and is thus called Reed--Muller
                 Decision Diagram Synthesis (RMDDS). RMRLS does not
                 scale to a large number of qubits (i.e., quantum bits).
                 DDS tools, even though efficient, add a large number of
                 ancillary qubits and typically incur much higher
                 quantum cost than necessary. RMDDS overcomes these
                 obstacles. It is flexible in the sense that users can
                 either optimize the number of qubits or the quantum
                 cost in the circuit implementation. It is also
                 efficient because the circuits can be synthesized
                 within user-defined CPU times. This combination of
                 flexibility and efficiency has been missing from
                 synthesizers presented earlier. When used to synthesize
                 reversible functions, RMDDS reduces the number of
                 qubits by up to 79.2\% (average of 54.6\%) when the
                 synthesis objective is to minimize the number of qubits
                 and the quantum cost by up to 71.5\% (average of
                 35.7\%) when the synthesis objective is to minimize
                 quantum cost, relative to DDS methods. For irreversible
                 functions (which are automatically embedded in
                 reversible functions), the corresponding best (average)
                 reductions in the number of qubits is 42.1\% (22.5\%)
                 when minimizing the number of qubits, and in quantum
                 cost, it is 63.0\% (25.9\%) when minimizing quantum
                 cost.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Liu:2014:CSN,
  author =       "Weichen Liu and Xuan Wang and Jiang Xu and Wei Zhang
                 and Yaoyao Ye and Xiaowen Wu and Mahdi Nikdast and
                 Zhehui Wang",
  title =        "On-chip sensor networks for soft-error tolerant
                 real-time multiprocessor systems-on-chip",
  journal =      j-JETC,
  volume =       "10",
  number =       "2",
  pages =        "15:1--15:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2564928",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Fri Feb 28 17:06:25 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "As transistor density continues to increase with the
                 advent of nanotechnology, reliability issues raised by
                 the more frequent appearance of soft errors are
                 becoming critical for future embedded multiprocessor
                 systems design. State-of-the-art techniques for soft
                 error protections targeting multiprocessor systems
                 result either high chip cost and area overhead or high
                 performance degradation and energy consumption, and do
                 not fulfill the increasing requirements for high
                 performance and dependability. In this article we
                 present a systematic approach, that is, the Sensor
                 Networks-on-Chip (SENoC), to collaboratively and
                 efficiently manage on-chip applications and overcome
                 reliability threats to Multiprocessor Systems-on-Chip
                 (MPSoC). A hardware-software collaborative approach is
                 proposed to solve soft error problems: a hardware-based
                 on-chip sensor network is built for soft error
                 detection, and a software-based recovery mechanism is
                 applied for soft error correction. A two-step
                 scheduling scheme is presented for reliable application
                 and chip management, combining an off-line static
                 optimization stage for application performance
                 maximization and an online lightweight dynamic
                 adjustment stage to handle runtime variations and
                 exceptions. This strategy introduces only trivial
                 overhead on hardware design and much lower overhead on
                 software control and execution, and hence performance
                 degradation and energy consumption is greatly reduced.
                 We build a cycle-accurate simulator using SystemC, and
                 verify the effectiveness of our technique by comparing
                 performance with related techniques on several
                 real-world applications.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Kim:2014:ICU,
  author =       "Jaeyoon Kim and Sandip Tiwari",
  title =        "Inexact computing using probabilistic circuits: Ultra
                 low-power digital processing",
  journal =      j-JETC,
  volume =       "10",
  number =       "2",
  pages =        "16:1--16:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2564925",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Fri Feb 28 17:06:25 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Numerous computing applications can tolerate low error
                 rates. In such applications, inexact approaches provide
                 the ability to achieve significantly lower power. This
                 work demonstrates the power-error trade-offs that can
                 be achieved. Using probabilistic modeling in sub-50-nm
                 silicon transistor technology, the relationship between
                 statistical uncertainties and errors are elucidated for
                 different configurations and topologies and the
                 trade-offs quantified. Gate-level implementation of the
                 probabilistic CMOS logic is validated by circuit
                 simulations of a commercial 45-nm SOI CMOS process
                 technology. Using a practical ALU architecture where
                 voltages can be scaled from most significant to least
                 significant bit blocks as an example, the potential
                 benefits of this technique are shown. A calculation
                 error of $ 10^{-6} $, an error rate quite tolerable for
                 many computational tasks, is shown to be possible with
                 a total power reduction of more than 40\%.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Pierce:2014:NTN,
  author =       "Luke Pierce and Spyros Tragoudas",
  title =        "Nanopipelined threshold network synthesis",
  journal =      j-JETC,
  volume =       "10",
  number =       "2",
  pages =        "17:1--17:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2564924",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Fri Feb 28 17:06:25 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Threshold logic gates allow for complex multiinput
                 functions to be implemented using a single gate thereby
                 reducing the power and area of a circuit. Clocked
                 threshold gates are nanopipelined to increase network
                 throughput. It is shown that synthesis methods that do
                 not consider the synchronization of the nanopipeline
                 can produce an enormous amount of buffers. The proposed
                 algorithm synthesizes a Boolean network into a
                 nanopipelined threshold logic network by minimizing not
                 only the number of combinational clusters but also the
                 associated buffer insertion overhead.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Xiang:2014:TDT,
  author =       "Dong Xiang and Kele Shen",
  title =        "A thermal-driven test application scheme for pre-bond
                 and post-bond scan testing of three-dimensional {ICs}",
  journal =      j-JETC,
  volume =       "10",
  number =       "2",
  pages =        "18:1--18:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2564922",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Fri Feb 28 17:06:25 MST 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "The three-dimensional (3-D) technology offers a new
                 solution to the increasing density of integrated
                 circuits (ICs). In this work, we propose novel scan
                 architectures for 3-D IC pre-bond and post-bond testing
                 by considering the interconnection overhead of
                 through-silicon-vias (TSVs). Since hotspots in 3-D ICs
                 often cause performance and reliability issues, we also
                 develop different test ordering schemes for prebond and
                 postbond testing to avoid applying test vectors that
                 could worsen the temperature distribution. Experimental
                 results show that the peak temperature can be lowered
                 by 20\% with the 3-D scan tree architecture. When
                 combined with the test ordering scheme, the 3-D scan
                 tree can further reduce peak temperature by over
                 30\%.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Kamal:2014:IPV,
  author =       "Mehdi Kamal and Ali Afzali-Kusha and Saeed Safari and
                 Massoud Pedram",
  title =        "Impact of Process Variations on Speedup and Maximum
                 Achievable Frequency of Extensible Processors",
  journal =      j-JETC,
  volume =       "10",
  number =       "3",
  pages =        "19:1--19:??",
  month =        apr,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2567665",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Mon May 5 14:50:39 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "In this article, we investigate the impact of process
                 variations on the speedup and maximum frequency of the
                 extended ISA processor. First, without considering
                 process variations, a custom functional unit (CFU) is
                 designed based on nominal timing parameters, then the
                 timing variations of critical paths of the extensible
                 processor, including the baseline processor and the
                 CFU, are investigated by considering both systematic
                 and random variations. Next, the maximum frequency of
                 the extensible processor and the speed enhancement
                 factor of the extended ISA for different benchmarks are
                 investigated. Results show that timing variation could
                 reduce the speedup of the extensible processor.
                 However, this reduction is highly dependent on the
                 baseline processor and the CFU structures.
                 Additionally, the impact of process variations in the
                 worst-case design approach is studied. Results show
                 that the speedup of the extensible processor is reduced
                 more than in the case when custom instructions (CIs)
                 are selected without considering process variations. To
                 study the impact of each variation type, speedup
                 variations due to random and systematic variations are
                 investigated separately. The study reveals that random
                 variation has a similar effect on the CFU and the
                 baseline processor, while the impact of systematic
                 variation on the baseline processor is greater than the
                 CFU.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Chung:2014:DET,
  author =       "Haera Chung and Christof Teuscher and Partha Pande",
  title =        "Design and Evaluation of Technology-Agnostic
                 Heterogeneous Networks-on-Chip",
  journal =      j-JETC,
  volume =       "10",
  number =       "3",
  pages =        "20:1--20:??",
  month =        apr,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2567666",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Mon May 5 14:50:39 MDT 2014",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Traditional metal-wire-based networks-on-chip (NoC)
                 suffer from high latency and power dissipation as the
                 system size scales up in the number of cores. This
                 limitation stems from the inherent multihop
                 communication nature of larger NoCs. It has previously
                 been shown that the performance of NoCs can be
                 significantly improved by introducing long-range, low
                 power, and high-bandwidth single-hop links between
                 distant cores. While previous work has focused on
                 specific NoC architectures and configurations, it
                 remains an open question whether heterogeneous link
                 types are beneficial in a broad range of NoC
                 architectures. In this article, we show that a generic
                 NoC architecture with heterogeneous link types allows
                 for NoCs with higher bandwidth at a lower cost compared
                 to homogeneous networks. We further show that such NoCs
                 scale up significantly better in terms of performance
                 and cost. We demonstrate these broadly-applicable
                 results by using a technology-agnostic complex network
                 approach that targets NoC architectures with various
                 emerging link types.",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Palaniswamy:2014:ITL,
author = "Ashok Kumar Palaniswamy and Spyros Tragoudas",
title = "Improved Threshold Logic Synthesis Using
Implicant-Implicit Algorithms",
journal = j-JETC,
volume = "10",
number = "3",
pages = "21:1--21:??",
month = apr,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2597175",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Mon May 5 14:50:39 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Existing threshold logic synthesis methods decompose
larger input functions into smaller input functions and
perform synthesis for them. It is shown that
significantly larger input functions can be synthesized
by implementing the existing methods in an
implicant-implicit manner. Experimental results on the
ISCAS 85 benchmarks show that this impacts the
synthesis cost, which drops significantly. More
specifically, as the size of the functions that can be
handled by the synthesis algorithm increases, the
number of threshold logic gates required to implement
very large input functions decreases. In addition, the
total weight decreases and the performance is
improved.",
acknowledgement = ack-nhfb,
articleno = "21",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Chen:2014:CTS,
author = "Fu-Wei Chen and Tingting Hwang",
title = "Clock-Tree Synthesis with Methodology of Reuse in
{$3$D-IC}",
journal = j-JETC,
volume = "10",
number = "3",
pages = "22:1--22:??",
month = apr,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2567668",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Mon May 5 14:50:39 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "IP reuse methodology has been used extensively in SoC
(system-on-chip) design. In this reuse methodology,
while design and implementation costs are saved,
manufacturing cost is not. To further reduce the cost,
this reuse concept has been proposed at mask and die
level in three-dimensional integrated circuits (3D-IC).
In order to achieve manufacturing reuse, in this
article, we propose a new methodology for designing a
global clock tree in 3D-IC. The objective is to extend
an existing clock tree in 2D IC to 3D IC, taking into
consideration the wirelength, clock skew, and the
number of TSVs. Compared with NNG- and 3D-MMM-based
methods, our proposed method reduces the wirelength of
the new die and the skew of the global 3D clock tree on
average, 5.85\% and 2.3\%, and 76.92\% and 48.7\%,
respectively. In more than two die design, the average
improvements of the wirelength and clock skew of our
method as compared with the 3D-MMM-based method are
4.23\% and 46.84\%, respectively.",
acknowledgement = ack-nhfb,
articleno = "22",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Liu:2014:CHP,
author = "Wulong Liu and Yu Wang and Yuchun Ma and Yuan Xie and
Huazhong Yang",
title = "On-Chip Hybrid Power Supply System for Wireless Sensor
Nodes",
journal = j-JETC,
volume = "10",
number = "3",
pages = "23:1--23:??",
month = apr,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2492683",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Mon May 5 14:50:39 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "With the miniaturization of electronic devices,
small-size but high-capacity power supply systems
appear to be more and more important. A hybrid power
source, which consists of a fuel cell (FC) and a
rechargeable battery, has the advantages of long
lifetime and good load-following capabilities. In this
article, we propose the schematic of a hybrid power
supply system that can be integrated on a chip
compatible with present CMOS processes. For the
on-chip, fuel-cell-based hybrid power system in
wireless sensor node design, we propose a two steps
optimization: (1) dynamic power management (DPM), and
(2) adaptive fuel cell optimal power point tracking
(AOPPT). Simulation results demonstrate that the
on-chip FC-Bat hybrid power system can be used for
wireless sensor nodes under different usage scenarios.
Our proposed DPM method can achieve 12.9\% more energy
savings than the method without DPM. Meanwhile,
implementing our AOPPT approach can save about 17\%
energy compared with the fixed architecture for the
fuel cell system. For an on-chip power system with
1cm$^2$ area consumption, the wafer-level battery can
power a typical sensor node for only about five months,
while our on-chip hybrid power system will supply the
same sensor node for two years steadily.",
acknowledgement = ack-nhfb,
articleno = "23",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Grissom:2014:IAC,
author = "Daniel Grissom and Christopher Curtis and Philip
Brisk",
title = "Interpreting Assays with Control Flow on Digital
Microfluidic Biochips",
journal = j-JETC,
volume = "10",
number = "3",
pages = "24:1--24:??",
month = apr,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2567669",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Mon May 5 14:50:39 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "BioCoder is a C++ library developed at Microsoft
Research, India, for the unambiguous specification of
biochemical assays. This article describes language
extensions to BioCoder along with a compiler and
runtime system that translate and execute assays
specified using BioCoder on a software simulator. The
simulator mimics the behavior of laboratories-on-a-chip
(LoCs) based on a droplet actuation technology called
electrowetting on dielectric (EWoD). To date, prior
compilers targeting similar EWoD devices are limited to
assays specified as directed acyclic graphs (DAGs) and
cannot handle arbitrary control flow or feedback from
the LoC. The framework presented herein addresses these
challenges through dynamic interpretation, thereby
enlarging the space of assays that can be compiled onto
EWoD devices.",
acknowledgement = ack-nhfb,
articleno = "24",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Yuan:2014:FEA,
author = "Bo Yuan and Bin Li",
title = "A Fast Extraction Algorithm for Defect-Free
Subcrossbar in Nanoelectronic Crossbar",
journal = j-JETC,
volume = "10",
number = "3",
pages = "25:1--25:??",
month = apr,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2517137",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Mon May 5 14:50:39 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Due to the super scale, high defect density, and
per-chip designing paradigm of emerging
nanoelectronics, the runtime of the algorithms for
defect-tolerant design is of vital importance from the
perspective of practicability. In this article, an
efficient and effective heuristic defect-free
subcrossbar extraction algorithm is proposed which
improves performance by mixing the heuristics from two
state-of-the-art algorithms and then is speeded up
significantly by considerably reducing the number of
major loops. Compared with the current most effective
algorithm that improves the solution quality (i.e.,
size of the defect-free subcrossbar obtained) at the
cost of high time complexity $ O(n^3) $, the time
complexity of the proposed heuristic algorithm is
proved to be $ O(n^2) $. Using a large set of
instances of various scales and defect densities, the
simulation results show that the proposed algorithm can
offer similar high-quality solutions as the current
most effective algorithm while consuming much shorter
runtimes (reduced to about 1/3 to 1/5) than the current
most effective algorithm.",
acknowledgement = ack-nhfb,
articleno = "25",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Chaudhuri:2014:VDS,
author = "Sourindra M. Chaudhuri and Niraj K. Jha",
title = "{$3$D} vs. {$2$D} Device Simulation of {FinFET} Logic
Gates under {PVT} Variations",
journal = j-JETC,
volume = "10",
number = "3",
pages = "26:1--26:??",
month = apr,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2567670",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Mon May 5 14:50:39 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Recently, multigate transistors have been gaining
attention as an alternative to conventional MOSFETs.
Superior gate control over the channel, smaller
subthreshold leakage, and reduced susceptibility to
process variations are some of the key features that
give multigate structures a competitive edge over
MOSFETs. Among various multigate structures,
silicon-on-insulator (SOI) FinFETs are promising, owing
to their ease of fabrication. However, characterization
of SOI FinFET devices/gates needs immediate attention
in order for them to gain greater popularity in this
decade. Ideally, 3D device simulation should be done
for accurate circuit analysis. However, this is
impractical due to the huge CPU time required. As a
possible alternative, simulating a 2D cross section of
the device yields 10$ \times $ to 100$ \times $
reduction in CPU time. However, this introduces
significant error in the range of 7\% to 20\% when
evaluating the on/off current ($ I_{\rm ON} / I_{\rm OFF} $)
for a single device and leakage current or propagation
delay ($ I_{\rm LEAK} / t_D $) for logic gates. In this
work, we first present a methodology to obtain
optimized 3D device simulation models for SOI FinFETs.
Based on these 3D models, we develop adjusted 2D models
to capture 3D simulation accuracy with 2D simulation
efficiency. We report results for the 22nm SOI FinFET
technology node. We adjust gate underlap ($ L_{\rm UN} $)
in the 2D cross section of the n/pFinFET devices in
order to mimic 3D device behavior. When the adjusted 2D
models are employed in mixed-mode simulation of FinFET
logic gates, the error in the evaluation of
$ I_{\rm LEAK} / t_D $ is very small. To the best of our knowledge,
this is the first such attempt. We show that 2D device
models remain valid even under process, voltage, and
temperature (PVT) variations. We target process
variations in gate length ($ L_G $), fin thickness
($ T_{\rm SI} $), gate oxide thickness ($ T_{\rm OX} $), and gate
workfunction ($ \Phi_G $), which are the parameters
that have been shown to have the most impact on leakage
and delay.",
acknowledgement = ack-nhfb,
articleno = "26",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Lin:2014:POF,
author = "Jiun-Li Lin and Po-Hsun Wu and Tsung-Yi Ho",
title = "Placement optimization of flexible {TFT} circuits with
mechanical strain and temperature consideration",
journal = j-JETC,
volume = "11",
number = "1",
pages = "1:1--1:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629497",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Mon Oct 6 16:15:58 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Mobility is the primary device parameter affecting
circuit performance in flexible thin-film transistor
(TFT) technologies, and is particularly sensitive to
the change of mechanical strain and temperature.
However, existing algorithms only consider the impact
of mechanical strain in cell placement of flexible TFT
circuits. Without taking temperature into
consideration, mobility may be dramatically decreased
which leads to circuit performance degradation. This
article presents the first work to minimize the
mobility variation caused by the change of both
mechanical strain and temperature. Experimental results
show that the proposed algorithms can effectively and
efficiently reduce the increasing critical path
delay.",
acknowledgement = ack-nhfb,
articleno = "1",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Roy:2014:TAG,
author = "Sudip Roy and Bhargab B. Bhattacharya and Sarmishtha
Ghoshal and Krishnendu Chakrabarty",
title = "Theory and analysis of generalized mixing and dilution
of biochemical fluids using digital microfluidic
biochips",
journal = j-JETC,
volume = "11",
number = "1",
pages = "2:1--2:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629578",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Mon Oct 6 16:15:58 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Digital microfluidic (DMF) biochips are recently being
advocated for fast on-chip implementation of
biochemical laboratory assays or protocols, and several
algorithms for diluting and mixing of reagents have
been reported. However, all methods for such automatic
sample preparation suffer from a drawback that they
assume the availability of input fluids in pure form,
that is, each with an extreme concentration factor (CF)
of 100\%. In many real-life scenarios, the stock
solutions consist of samples/reagents with multiple
CFs. No algorithm is yet known for preparing a target
mixture of fluids with a given ratio when its
constituents are supplied with random concentrations.
An intriguing question is whether or not a given target
ratio is feasible to produce from such a general input
condition. In this article, we first study the
feasibility properties for the generalized mixing
problem under the (1:1) mix-split model with an
allowable error in the target CFs not exceeding $ 1 / 2^d $,
where the integer $d$ is user specified and denotes the
desired accuracy level of CF. Next, an algorithm is
proposed which produces the desired target ratio of $N$
reagents in $ O(N d) $ mix-split steps, where $N$ ($ \geq 3 $)
denotes the number of constituent fluids in the
mixture. The feasibility analysis also leads to the
characterization of the total space of input stock
solutions from which a given target mixture can be
derived, and conversely, the space of all target
ratios, which are derivable from a given set of input
reagents with arbitrary CFs. Finally, we present a
generalized algorithm for diluting a sample S in
minimum (1:1) mix-split steps when two or more
arbitrary concentrations of S (diluted with the same
buffer) are supplied as inputs. These results settle
several open questions in droplet-based algorithmic
microfluidics and offer efficient solutions for a wider
class of on-chip sample preparation problems.",
acknowledgement = ack-nhfb,
articleno = "2",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Chen:2014:ULL,
author = "Xianmin Chen and Niraj K. Jha",
title = "Ultra-low-leakage chip multiprocessor design with
hybrid {FinFET} logic styles",
journal = j-JETC,
volume = "11",
number = "1",
pages = "3:1--3:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629576",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Mon Oct 6 16:15:58 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "FinFET has begun replacing CMOS at the 22nm technology
node because of its enhanced ability to mitigate
short-channel effects. Although leakage power of FinFET
logic gates is lower than their CMOS counterparts, it
still contributes to a large part of total power
consumption. In this article, we show how
ultra-low-leakage FinFET chip multiprocessors (CMPs)
can be designed using a hybrid logic style. This hybrid
style exploits the ultra-low-leakage feature of
asymmetric-workfunction shorted-gate (ASG) FinFETs and
the high-performance feature of shorted-gate (SG)
FinFETs. We explore the impact of the hybrid style at
both the module and CMP levels. To do this, we have
developed FinFET logic libraries targeted at SG and ASG
logic gates, suitably characterized for various
parameters of interest. We have also modified existing
tools and created a framework to evaluate the hybrid
designs of SRAMs, caches, and CMPs. Using the design
with SG FinFETs as the baseline for comparison, our
experimental results show that the hybrid style can
reduce leakage power of execution units to as low as
10.6\% of the baseline without hurting performance,
that of SRAMs to between 21.5\% and 4.8\% of the
baseline with 0\%--8.3\% delay overhead, and that of
CMPs to 10.0\% of the baseline with negligible
performance degradation.",
acknowledgement = ack-nhfb,
articleno = "3",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Lin:2014:NTL,
author = "Ing-Chao Lin and Shun-Ming Syu and Tsung-Yi Ho",
title = "{NBTI} tolerance and leakage reduction using gate
sizing",
journal = j-JETC,
volume = "11",
number = "1",
pages = "4:1--4:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629657",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Mon Oct 6 16:15:58 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Leakage power is a major design constraint in deep
submicron technology and below. Meanwhile, transistor
degradation due to Negative Bias Temperature
Instability (NBTI) has emerged as one of the main
reliability concerns in nanoscale technology. Gate
sizing is a widely used technique to reduce circuit
leakage, and this approach has recently attracted much
attention with regard to improving circuits to tolerate
NBTI. However, these studies only consider timing and
area constraints, and many other important issues, such
as slew and max-load, are missing. In this article, we
present an efficient gate sizing framework that can
reduce leakage and improve circuit reliability under
timing constraints. Our algorithms consider slack, slew
and max-load constraints. The benchmarks are those from
ISPD 2012, which feature industrial design properties,
including discrete cell sizes, nonconvex cell timing
models, slew dependencies and constraints, as well as
large design sizes. The experimental results obtained
from ISPD 2012 benchmark circuits demonstrate that our
approach can meet all the constraints and tolerated
NBTI degradation with a power savings of 6.54\% as
compared with the traditional method.",
acknowledgement = ack-nhfb,
articleno = "4",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Xie:2014:TCP,
author = "Jing Xie and Yang Du and Yuan Xie",
title = "Testable cross-power domain interface {(CPDI)} circuit
design in monolithic {$3$D} technology",
journal = j-JETC,
volume = "11",
number = "1",
pages = "5:1--5:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629516",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Mon Oct 6 16:15:58 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Optimizing energy consumption for electronic systems
has been an important design consideration. Multipower
domain design is widely used for low-power and
high-performance applications. Data transfer between
power domains needs a cross-power domain interface
(CPDI). The existing level-conversion flip-flop (LCFF)
structures all need dual power rails, which lead to
large area and performance overhead. In this article,
we propose a scanable CPDI circuit, utilizing
monolithic 3D technology. This interface functions as a
flip-flop and provides reliable data conversion from
one power domain to another. It has a built-in scan
feature, which makes it a testable design. Our design
separates power rails in each tier, substantially
reducing physical design complexity and area penalty.
The design is implemented in a 20nm, 28nm, and 45nm
low-power technology. It shows a 20\%--35\% smaller
insertion delay compared to normal designs. This
proposed design also shows scalability and better
energy consumption than previous LCFF circuits.",
acknowledgement = ack-nhfb,
articleno = "5",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Kumawat:2014:PMA,
author = "Renu Kumawat and Vineet Sahula and Manoj S. Gaur",
title = "Probabilistic modeling and analysis of molecular
memory",
journal = j-JETC,
volume = "11",
number = "1",
pages = "6:1--6:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629533",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Mon Oct 6 16:15:58 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "This article investigates the aspects of designing a
nanocell based molecular memory. An empirical model for
molecular device is developed, based on circuit
behavior of nitro-substituted Oligo (Phenylene
Ethynylene) molecule (OPE). This device model is
subsequently used to design nanocell based 1-bit memory
and verified using HSPICE. The approach is extended to
train the nanocell for multibit storage capability
using external voltage signals. It is observed that to
successfully train a 2-bit molecular memory, the number
of control signals should be approx. one-fourth of
total number of nanoparticles. A computational
framework is proposed to compute the probability of
retrieving the stored data bits correctly, at the
output terminal of the nanocell buffer. This nanocell
configuration is simulated by systematically varying
number of nanoparticles and molecular switches. It is
observed that the probability of the existence of at
least one path from input to output approaches close to
unity with presence of 20 or more nanoparticles in a
nanocell. During memory model validation, 1000 samples
of 1-bit memory (consisting of 20 nanoparticles) were
generated and verified for read and write operations.
The model verification results obtained for this memory
cell closely match those obtained using analytical
solution of probabilistic graph model.",
acknowledgement = ack-nhfb,
articleno = "6",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Lin:2014:QQM,
author = "Chia-Chun Lin and Amlan Chakrabarti and Niraj K. Jha",
title = "{QLib}: Quantum module library",
journal = j-JETC,
volume = "11",
number = "1",
pages = "7:1--7:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629430",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Mon Oct 6 16:15:58 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Quantum algorithms are known for their ability to
solve some problems much faster than classical
algorithms. They are executed on quantum circuits,
which consist of a cascade of quantum gates. However,
synthesis of quantum circuits is not straightforward
because of the complexity of quantum algorithms.
Generally, quantum algorithms contain two parts:
classical and quantum. Thus, synthesizing circuits for
the two parts separately reduces overall synthesis
complexity. In addition, many quantum algorithms use
similar subroutines that can be implemented with
similar circuit modules. Because of their frequent use,
it is important to use automated scripts to generate
such modules efficiently. These modules can then be
subjected to further synthesis optimizations. This
article proposes QLib, a quantum module library, which
contains scripts to generate quantum modules of
different sizes and specifications for well-known
quantum algorithms. Thus, QLib can also serve as a
suite of benchmarks for quantum logic and physical
synthesis.",
acknowledgement = ack-nhfb,
articleno = "7",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Wille:2014:ISI,
author = "Robert Wille and Rolf Drechsler and Mehdi B. Tahoori",
title = "Introduction to the {Special Issue on Reversible
Computation}",
journal = j-JETC,
volume = "11",
number = "2",
pages = "8:1--8:??",
month = nov,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2663349",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Nov 5 18:01:28 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
acknowledgement = ack-nhfb,
articleno = "8",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{DeVos:2014:MCC,
author = "Alexis {De Vos} and Stijn {De Baerdemacker}",
title = "Matrix Calculus for Classical and Quantum Circuits",
journal = j-JETC,
volume = "11",
number = "2",
pages = "9:1--9:??",
month = nov,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2669370",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Nov 5 18:01:28 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Quantum computation on $w$ qubits is represented by
the infinite unitary group $ {\rm U}(2^w) $; classical
reversible computation on $w$ bits is represented by
the finite symmetric group $ {\rm S}_{2^w} $. In order to
establish the relationship between classical reversible
computing and quantum computing, we introduce two Lie
subgroups $ {\rm XU}(n)$ and $ {\rm ZU}(n)$ of the
unitary group $ {\rm U}(n)$. The former consists of all
unitary $ n \times n$ matrices with all line sums equal
to $1$; the latter consists of all unitary diagonal $ n
\times n$ matrices with first entry equal to $1$. Such
a group structure also reveals the relationship between
matrix calculus and diagrammatic $ z x$-calculus of
quantum circuits.",
acknowledgement = ack-nhfb,
articleno = "9",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Hanninen:2014:QII,
author = "Ismo K. H{\"a}nninen and Craig S. Lent and Gregory L.
Snider",
title = "Quantifying Irreversible Information Loss in Digital
Circuits",
journal = j-JETC,
volume = "11",
number = "2",
pages = "10:1--10:??",
month = nov,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629523",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Nov 5 18:01:28 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Heat generation limits the performance of
state-of-the-art integrated circuits, originating from
the wasteful static CMOS operating principle. Near-term
solutions like adiabatic charging for energy recovery
and limiting friction-type heat sources provide
considerable improvement. However, these methods do not
address the ultimate thermodynamic necessity to expel
energy related to information loss in the computing
process. In emerging beyond-CMOS technologies, this bit
erasure heat alone can overwhelm the cooling capacity
and set the limits of the computing performance.
Therefore, logical information loss is becoming an
important factor for digital circuit design, and tools
have to be developed for analysis and optimization.
This article presents a framework for estimating the
amount of information loss in complex logic circuits,
demonstrating the method by modeling the irreversible
bit erasures in a standard binary adder structure.
Binary addition is one of the most often used and
highly optimized digital designs, and we estimate the
erasure bounds for components on various levels of
design abstraction, showing that the actual logic gate
implementations have orders of magnitude higher loss
than the addition operation itself would require. The
method and the results can be used to optimize circuits
for a higher degree of logical reversibility and energy
conservation.",
acknowledgement = ack-nhfb,
articleno = "10",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{DeVos:2014:DGF,
author = "Alexis {De Vos} and St{\'e}phane Burignat and Robert
Gl{\"u}ck and Torben {\AE}gidius Mogensen and Holger
Bock Axelsen and Michael Kirkedal Thomsen and Eva
Rotenberg and Tetsuo Yokoyama",
title = "Designing Garbage-Free Reversible Implementations of
the Integer Cosine Transform",
journal = j-JETC,
volume = "11",
number = "2",
pages = "11:1--11:??",
month = nov,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629532",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Nov 5 18:01:28 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Discrete linear transformations are important tools in
information processing. Many such transforms are
injective and therefore prime candidates for a
physically reversible implementation into hardware. We
present here reversible integer cosine transformations
on $n$ input integers. The resulting reversible circuit
is able to perform both the forward transform and the
inverse transform. The detailed structure of such a
reversible design strongly depends on the odd prime
factors of the determinant of the transform: whether
those are of the form $ 2^k \pm 1 $ or of the form $
2^k \pm 2^l \pm 1 $ or neither of these forms.",
acknowledgement = ack-nhfb,
articleno = "11",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Mogensen:2014:GFR,
author = "Torben {\AE}gidius Mogensen",
title = "Garbage-Free Reversible Multipliers for Arbitrary
Constants",
journal = j-JETC,
volume = "11",
number = "2",
pages = "12:1--12:??",
month = nov,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629515",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Nov 5 18:01:28 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "We present a method based on Mealy machines for
constructing reversible circuitry for multiplying
integers by arbitrary integer constants. The circuits
generate no garbage and use no ancillae. The circuits
are quite compact for small constants and are, in the
worst case, bounded by $ O(n^2) $ multi-control Toffoli
gates per bit-slice, where $n$ is the number of bits in
the constant. These gates will have $ O(n)$ inputs, so
the total number of pass-transistors needed to
implement the circuit is $ O(n^3) $ transistors per bit
slice, and the quantum cost (which is exponential in
the number of inputs to a Toffoli gate) is $ O(2^n)$.
For some interesting cases, the cost can be reduced to
$ O(n)$ gates per bit-slice, reducing the cost to $
O(n^2)$ transistors per bit slice. The quantum cost is
still $ O(2^n)$, as the remaining gates have $ O(n)$
inputs. We also look at an alternative construction
that, at the cost of adding $ O(n)$ ancillae, reduces
the cost for arbitrary constants to $ O(n)$ gates, $
O(n^2)$ transistors, though still with $ O(2^n)$
quantum cost.",
acknowledgement = ack-nhfb,
articleno = "12",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Nguyen:2014:RED,
  author =       {Trung Duc Nguyen and Rodney {Van Meter}},
  title =        {A Resource-Efficient Design for a Reversible Floating
                  Point Adder in Quantum Computing},
  journal =      j-JETC,
  volume =       {11},
  number =       {2},
  pages =        {13:1--13:??},
  month =        nov,
  year =         {2014},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2629525},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Wed Nov 5 18:01:28 MST 2014},
  bibsource =    {http://www.acm.org/pubs/contents/journals/jetc/;
                  https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                  https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract =     {Reversible logic has applications in low-power
                  computing and quantum computing. However, there are few
                  existing designs for reversible floating-point adders
                  and none suitable for quantum computation. In this
                  article, we propose a resource-efficient reversible
                  floating-point adder, suitable for binary quantum
                  computation, improving the design of Nachtigal et al.
                  [2011]. Our work focuses on improving the reversible
                  designs of the alignment unit and the normalization
                  unit, which are the most expensive parts. By changing a
                  few elements of the existing algorithm, including the
                  circuit designs of the RLZC (reversible leading zero
                  counter) and converter, we have reduced the cost by
                  about 68\%. We also propose quantum designs adapted to
                  use gates from fault-tolerant libraries. The KQ for our
                  fault-tolerant design is almost 60 times as expensive
                  as for a 32-bit fixed-point addition. We note that the
                  floating-point representation makes in-place, truly
                  reversible arithmetic impossible, requiring us to
                  retain both inputs, which limits the sustainability of
                  its use for quantum computation.},
  acknowledgement = ack-nhfb,
  articleno =    {13},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                  Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Shafaei:2014:CSR,
author = "Alireza Shafaei and Mehdi Saeedi and Massoud Pedram",
title = "Cofactor Sharing for Reversible Logic Synthesis",
journal = j-JETC,
volume = "11",
number = "2",
pages = "14:1--14:??",
month = nov,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629524",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Nov 5 18:01:28 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Improving circuit realization of known quantum
algorithms by CAD techniques has benefits for quantum
experimentalists. In this article, the problem of
synthesizing a given function on a set of ancillae is
addressed. The proposed approach benefits from
extensive sharing of cofactors among cubes that appear
on function outputs. Accordingly, it can be considered
a multilevel logic optimization technique for
reversible circuits. In particular, the suggested
approach can efficiently implement any $n$-input,
$m$-output lookup table (LUT) by a reversible circuit.
This problem has interesting applications in the Shor's
number-factoring algorithm and in quantum walk on
sparse graphs. Simulation results reveal that the
proposed cofactor-sharing synthesis algorithm has a
significant impact on reducing the size of modular
exponentiation circuits for Shor's quantum factoring
algorithm, oracle circuits in quantum walk on sparse
graphs, and the well-known MCNC benchmarks.",
acknowledgement = ack-nhfb,
articleno = "14",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Datta:2014:IRC,
  author =       {Kamalika Datta and Gaurav Rathi and Indranil Sengupta
                  and Hafizur Rahaman},
  title =        {An Improved Reversible Circuit Synthesis Approach
                  using Clustering of {ESOP} Cubes},
  journal =      j-JETC,
  volume =       {11},
  number =       {2},
  pages =        {15:1--15:??},
  month =        nov,
  year =         {2014},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2629543},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Wed Nov 5 18:01:28 MST 2014},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract =     {The problem of reversible logic synthesis has drawn
                  the attention of many researchers over the last two
                  decades with growing emphasis on low-power design.
                  Among the various synthesis approaches that have been
                  reported, the ones based on compact circuit
                  representations like Binary Decision Diagrams (BDD) and
                  Exclusive-or Sum-Of-Products (ESOP) are interesting in
                  the sense that they can handle large circuits with more
                  than 100 inputs. The drawback of these approaches,
                  however, is that the generated netlists are
                  sub-optimal, and there is lot of scope for optimizing
                  them. One of the best methods in this regard is an
                  approach, where the ESOP cubes are grouped into
                  sublists based on sharing among more than one outputs.
                  In the work reported in this article, in contrast, an
                  approach based on clustering the ESOP cubes based on
                  their similarity with respect to input variables is
                  presented, along with a technique to map each of the
                  clusters into reversible gate netlists. This approach
                  results in a significant reduction in quantum cost of
                  the final netlist, but requires one additional garbage
                  line. Experimental results on a number of reversible
                  circuit benchmarks have been presented in support of
                  the claim and also demonstrate that the method is very
                  fast.},
  acknowledgement = ack-nhfb,
  articleno =    {15},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                  Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Tida:2014:NTS,
  author =       {Umamaheswara Rao Tida and Cheng Zhuo and Yiyu Shi},
  title =        {Novel Through-Silicon-Via Inductor-Based On-Chip
                  {DC--DC} Converter Designs in {$3$D} {ICs}},
  journal =      j-JETC,
  volume =       {11},
  number =       {2},
  pages =        {16:1--16:??},
  month =        nov,
  year =         {2014},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2637481},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Wed Nov 5 18:01:28 MST 2014},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract =     {There has been a tremendous research effort in recent
                  years to move DC-DC converters on chip for enhanced
                  performance. However, a major limiting factor to
                  implementing on-chip inductive DC-DC converters is the
                  large area overhead induced by spiral inductors. Thus,
                  we propose using through-silicon-vias (TSVs), a
                  critical enabling technique in three-dimensional (3D)
                  integrated systems, to implement on-chip inductors for
                  DC-DC converters. While existing literature show that
                  TSV inductors are inferior compared with conventional
                  spiral inductors due to substrate loss for RF
                  applications, in this article, we demonstrate that it
                  is not the case for DC-DC converters, which operate at
                  relatively low frequencies. Experimental results show
                  that by replacing conventional spiral inductors with
                  TSV inductors, with almost the same efficiency and
                  output voltage, up to $ 4.3 \times $ and $ 3.2 \times $
                  inductor area reduction can be achieved for the
                  single-phase buck converter and the interleaved buck
                  converter with magnetic coupling, respectively.},
  acknowledgement = ack-nhfb,
  articleno =    {16},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                  Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Murray:2014:PEC,
  author =       {Jacob Murray and Ryan Kim and Paul Wettin and Partha
                  Pratim Pande and Behrooz Shirazi},
  title =        {Performance Evaluation of Congestion-Aware Routing
                  with {DVFS} on a Millimeter-Wave Small-World Wireless
                  {NoC}},
  journal =      j-JETC,
  volume =       {11},
  number =       {2},
  pages =        {17:1--17:??},
  month =        nov,
  year =         {2014},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2644816},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Wed Nov 5 18:01:28 MST 2014},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract =     {The mm-wave small-world wireless NoC (mSWNoC) has
                  emerged as an enabling interconnection infrastructure
                  for designing high-bandwidth and energy-efficient
                  multicore chips. In this mSWNoC architecture,
                  long-range communication predominately takes place
                  through the wireless shortcuts operating in the range
                  of 10--100GHz, whereas short-range data exchange occurs
                  through conventional metal wires. This results in
                  performance advantages (lower latency and energy
                  dissipation), mainly stemming from using the wireless
                  links as long-range shortcuts between far-apart cores.
                  The performance gain introduced by the wireless
                  channels can be enhanced further if the wireline links
                  of the mSWNoC are optimized according to the traffic
                  patterns arising out of the application workloads.
                  While there is significant energy savings, and hence
                  temperature reduction, in the network due to the mSWNoC
                  architecture, a load-imbalanced network is still
                  susceptible to local temperature hotspots. In this
                  work, we demonstrate that by incorporating
                  congestion-avoidance routing with network-level dynamic
                  voltage and frequency scaling (DVFS) in an mSWNoC, the
                  power and thermal profiles can be improved without a
                  significant impact on the overall network performance.
                  In this work, we demonstrate how novel interconnect
                  architectures enabled by the on-chip wireless links
                  coupled with power management strategies can improve
                  the energy and thermal characteristics of an mSWNoC
                  significantly without introducing any performance
                  degradation with respect to the conventional mesh-based
                  NoC.},
  acknowledgement = ack-nhfb,
  articleno =    {17},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                  Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Mohanty:2014:SOS,
  author =       {Pragyan (Sheela) Mohanty and Spyros Tragoudas},
  title =        {Scalable Offline Searches in {DNA} Sequences},
  journal =      j-JETC,
  volume =       {11},
  number =       {2},
  pages =        {18:1--18:??},
  month =        nov,
  year =         {2014},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2660774},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Wed Nov 5 18:01:28 MST 2014},
  bibsource =    {http://www.acm.org/pubs/contents/journals/jetc/;
                  https://www.math.utah.edu/pub/tex/bib/jetc.bib;
                  https://www.math.utah.edu/pub/tex/bib/string-matching.bib},
  abstract =     {Searching for a particular pattern in a very large DNA
                  database is a fundamental and essential component in
                  computational biology. In the biological world, pattern
                  matching is required for finding repeats in a
                  particular DNA sequence, finding motif, aligning
                  sequences, and other similar tasks. Due to an immense
                  amount and continuous increase of biological data, the
                  searching process requires very fast algorithms. A
                  function-based tool set for fast offline pattern
                  searches in large DNA sequences is proposed. The method
                  benefits from the use of Boolean functions, their
                  compact storage using canonical data structure, and the
                  existence of built-in operators for these data
                  structures. Experiments on DNA sequences from the NCBI
                  database show that the proposed approach is scalable.
                  The time complexity depends on the size of the data
                  structure used for storing the function that represents
                  the DNA sequence. It is shown that the presented
                  approach exhibits sublinear time complexity to the DNA
                  sequence size.},
  acknowledgement = ack-nhfb,
  articleno =    {18},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                  Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Chaudhuri:2014:ALD,
author = "Sourindra M. Chaudhuri and Prateek Mishra and Niraj K.
Jha",
title = "Accurate Leakage\slash Delay Estimation for {FinFET}
Standard Cells under {PVT} Variations using the
Response Surface Methodology",
journal = j-JETC,
volume = "11",
number = "2",
pages = "19:1--19:??",
month = nov,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2665066",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Nov 5 18:01:28 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Among different multi-gate transistors, FinFETs and
Trigate FETs have set themselves apart as the most
promising candidates for the upcoming 22nm technology
node and beyond owing to their superior device
performance, lower leakage power consumption, and
cost-effective fabrication process. Innovative circuit
design and optimization techniques will be required to
harness the power of multi-gate transistors, which in
turn will depend on accurate leakage and timing
characterization of these devices under spatial and
environmental variations. Hence, in order to aid
circuit designers, we present accurate analytical
models using central composite rotatable design (CCRD)
based on response surface methodology (RSM) to estimate
the leakage current and delay of FinFET standard cells
under the effect of variations in gate length ($ L_G$),
fin thickness ($ T_{SI}$), gate-oxide thickness ($
T_{OX}$), gate-workfunction ($ \Phi_G$), supply voltage
($ V_{DD}$), and temperature ($T$). To the best of our
knowledge, this is the first such attempt to develop
analytical models for leakage/delay estimation of
FinFET logic gates. To derive these models, we employ
TCAD device simulations of adjusted 2D device cross
sections that have been shown to track TCAD device
simulations of 3D device behavior within a 1--3\% error
range. This drastically reduces the CPU time of our
modeling technique (by several orders of magnitude)
without much loss in accuracy. We present analytical
leakage and delay models for different sizes and logic
styles (e.g., shorted-gate (SG) and independent-gate
(IG) FinFETs at the 22nm technology node). Both leakage
and delay estimates derived from the analytical models
are in close agreement with quasi-Monte Carlo (QMC)
simulation results (QMC simulations track the accuracy
of Monte Carlo simulations, but are several orders of
magnitude faster) obtained for different adjusted-2D
logic gates with a root mean square error (RMSE) in the
0.23\%--5.87\% range.",
acknowledgement = ack-nhfb,
articleno = "19",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Myers:2014:ISI,
  author =       {Chris J. Myers and Herbert Sauro and Anil Wipat},
  title =        {Introduction to the Special Issue on Computational
                  Synthetic Biology},
  journal =      j-JETC,
  volume =       {11},
  number =       {3},
  pages =        {20:1--20:??},
  month =        dec,
  year =         {2014},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2668126},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Wed Jan 7 15:40:14 MST 2015},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract =     {The goal of this special issue is to introduce the
                  field of computational synthetic biology to engineers
                  and computer scientists. The first article gives an
                  introduction to the key biological principles and
                  experimental techniques that support synthetic biology,
                  and it draws analogies with the computing field. This
                  issue also includes five original research articles in
                  computational synthetic biology. The first research
                  article discusses how standards can be used to
                  modularize the design process for genetic circuits. The
                  next two articles introduce new abstraction techniques
                  to improve the efficiency of analysis of genetic
                  circuit models. The last two articles introduce new
                  design techniques that help decouple design from
                  construction. We hope this sampling from the field will
                  help to motivate others to join this exciting and rich
                  area of research.},
  acknowledgement = ack-nhfb,
  articleno =    {20},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                  Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Deans:2014:PNS,
  author =       {Tara L. Deans},
  title =        {Parallel Networks: Synthetic Biology and Artificial
                  Intelligence},
  journal =      j-JETC,
  volume =       {11},
  number =       {3},
  pages =        {21:1--21:??},
  month =        dec,
  year =         {2014},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2667229},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Wed Jan 7 15:40:14 MST 2015},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract =     {Synthetic biology has emerged as an important
                  technology for engineering cells to behave in
                  controllable and predictable ways. The promise of this
                  modern technology is dependent on our understanding of
                  cellular complexity to allow us to engineer cells with
                  novel function. In this regard, the fields of computer
                  science and synthetic biology are critical for
                  accelerating both our understanding of biological
                  systems, and our ability to quantitatively engineer
                  cells. Thus, advances in biology and biotechnology are
                  arising at the intersection of computer science and
                  synthetic biology approaches. This review seeks to
                  introduce the field of synthetic biology to the
                  computer science community, and to ignite a curiosity
                  and interest in fostering a unique synergy for possible
                  collaborations between synthetic biologists and
                  computer scientists.},
  acknowledgement = ack-nhfb,
  articleno =    {21},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                  Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Misirli:2014:CMM,
  author =       {Goksel Misirli and Jennifer Hallinan and Anil Wipat},
  title =        {Composable Modular Models for Synthetic Biology},
  journal =      j-JETC,
  volume =       {11},
  number =       {3},
  pages =        {22:1--22:??},
  month =        dec,
  year =         {2014},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2631921},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Wed Jan 7 15:40:14 MST 2015},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract =     {Modelling and computational simulation are crucial for
                  the large-scale engineering of biological circuits
                  since they allow the system under design to be
                  simulated prior to implementation in vivo. To support
                  automated, model-driven design it is desirable that in
                  silico models are modular, composable and use standard
                  formats. The synthetic biology design process typically
                  involves the composition of genetic circuits from
                  individual parts. At the most basic level, these parts
                  are representations of genetic features such as
                  promoters, ribosome binding sites (RBSs), and coding
                  sequences (CDSs). However, it is also desirable to
                  model the biological molecules and behaviour that arise
                  when these parts are combined in vivo. Modular models
                  of parts can be composed and their associated systems
                  simulated, facilitating the process of model-centred
                  design. The availability of databases of modular models
                  is essential to support software tools used in the
                  model-driven design process. In this article, we
                  present an approach to support the development of
                  composable, modular models for synthetic biology,
                  termed Standard Virtual Parts. We then describe a
                  programmatically accessible and publicly available
                  database of these models to allow their use by
                  computational design tools.},
  acknowledgement = ack-nhfb,
  articleno =    {22},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                  Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Madsen:2014:SMC,
author = "Curtis Madsen and Zhen Zhang and Nicholas Roehner and
Chris Winstead and Chris Myers",
title = "Stochastic Model Checking of Genetic Circuits",
journal = j-JETC,
volume = "11",
number = "3",
pages = "23:1--23:??",
month = dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2644817",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Jan 7 15:40:14 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Synthetic genetic circuits have a number of exciting
potential applications such as cleaning up toxic waste,
hunting and killing tumor cells, and producing drugs
and bio-fuels more efficiently. When designing and
analyzing genetic circuits, researchers are often
interested in the probability of observing certain
behaviors. Discerning these probabilities typically
involves simulating the circuit to produce some time
series data and computing statistics over the resulting
data. However, for very rare behaviors of complex
genetic circuits, it becomes computationally
intractable to obtain good results as the number of
required simulation runs grows exponentially. It is,
therefore, necessary to apply numerical methods to
determine these probabilities directly. This article
describes how stochastic model checking, a method for
determining the likelihood that certain events occur in
a system, can be applied to models of genetic circuits
by translating them into continuous-time Markov chains
(CTMCs) and analyzing them using Markov chain analysis
to check continuous stochastic logic (CSL) properties.
The utility of this approach is demonstrated with
several case studies illustrating how this method can
be used to perform design space exploration of two
genetic oscillators and two genetic state-holding
elements. Our results show that this method results in
a substantial speedup as compared with conventional
simulation-based approaches.",
acknowledgement = ack-nhfb,
articleno = "23",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Fellermann:2014:FMD,
  author =       {Harold Fellermann and Maik Hadorn and Rudolf M.
                  F{\"u}chslin and Natalio Krasnogor},
  title =        {Formalizing Modularization and Data Hiding in
                  Synthetic Biology},
  journal =      j-JETC,
  volume =       {11},
  number =       {3},
  pages =        {24:1--24:??},
  month =        dec,
  year =         {2014},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2667231},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Wed Jan 7 15:40:14 MST 2015},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract =     {Biological systems employ compartmentalization and
                  other co-localization strategies in order to
                  orchestrate a multitude of biochemical processes by
                  simultaneously enabling ``data hiding'' and
                  modularization. This article presents recent research
                  that embraces compartmentalization and co-location as
                  an organizational programmatic principle in synthetic
                  biological and biomimetic systems. In these systems,
                  artificial vesicles and synthetic minimal cells are
                  envisioned as nanoscale reactors for programmable
                  biochemical synthesis and as chassis for molecular
                  information processing. We present P systems, brane
                  calculi, and the recently developed chemtainer calculus
                  as formal frameworks providing data hiding and
                  modularization and thus enabling the representation of
                  highly complicated hierarchically organized
                  compartmentalized reaction systems. We demonstrate how
                  compartmentalization can greatly reduce the complexity
                  required to implement computational functionality, and
                  how addressable compartments permit the scaling-up of
                  programmable chemical synthesis.},
  acknowledgement = ack-nhfb,
  articleno =    {24},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                  Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Oberortner:2014:RBD,
author = "Ernst Oberortner and Swapnil Bhatia and Erik Lindgren
and Douglas Densmore",
title = "A Rule-Based Design Specification Language for
Synthetic Biology",
journal = j-JETC,
volume = "11",
number = "3",
pages = "25:1--25:??",
month = dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2641571",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Wed Jan 7 15:40:14 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Synthetic Biology is an engineering discipline where
parts of DNA sequences are composed into novel, complex
systems that execute a desired biological function.
Functioning and well-behaving biological systems adhere
to a certain set of biological ``rules''. Data exchange
standards and Bio-Design Automation (BDA) tools support
the organization of part libraries and the exploration
of rule-compliant compositions. In this work, we
formally define a design specification language,
enabling the integration of biological rules into the
Synthetic Biology engineering process. The supported
rules are divided into five categories: Counting,
Pairing, Positioning, Orientation, and Interactions. We
formally define the semantics of each rule,
characterize the language's expressive power, and
perform a case study in that we iteratively design a
genetic Priority Encoder circuit following two
alternative paradigms---rule-based and template-driven.
Ultimately, we touch a method to approximate the
complexity and time to computationally enumerate all
rule-compliant designs. Our specification language may
or may not be expressive enough to capture all designs
that a Synthetic Biologist might want to describe, or
the complexity one might find through experiments.
However, computational support for the acquisition,
specification, management, and application of
biological rules is inevitable to understand the
functioning of biology.",
acknowledgement = ack-nhfb,
articleno = "25",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Huang:2014:FMD,
  author =       {Haiyao Huang and Douglas Densmore},
  title =        {{Fluigi}: Microfluidic Device Synthesis for Synthetic
                  Biology},
  journal =      j-JETC,
  volume =       {11},
  number =       {3},
  pages =        {26:1--26:??},
  month =        dec,
  year =         {2014},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2660773},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Wed Jan 7 15:40:14 MST 2015},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract =     {One goal of synthetic biology is to design and build
                  genetic circuits in living cells for a range of
                  applications. Our incomplete knowledge of the effects
                  of metabolic load and biological ``crosstalk'' on the
                  host cell make it difficult to construct multilevel
                  genetic logic circuits in a single cell, limiting the
                  scalability of engineered biological systems.
                  Microfluidic technologies provide reliable and scalable
                  construction of synthetic biological systems by
                  allowing compartmentalization of cells encoding simple
                  genetic circuits and the spatiotemporal control of
                  communication among these cells. This control is
                  achieved via valves on the microfluidics chip which
                  restrict fluid flow when activated. We describe a
                  Computer Aided Design (CAD) framework called ``Fluigi''
                  for optimizing the layout of genetic circuits on a
                  microfluidic chip, generating the control sequence of
                  the associated signaling fluid valves, and simulating
                  the behavior of the configured biological circuits. We
                  demonstrate the capabilities of Fluigi on a set of
                  Boolean algebraic benchmark circuits found in both
                  synthetic biology and electrical engineering and a set
                  of assay-based benchmark circuits. The integration of
                  microfluidics and synthetic biology has the capability
                  to increase the scale of engineered biological systems
                  for applications in DNA assembly, biosensors, and
                  screening assays for novel orthogonal genetic parts.},
  acknowledgement = ack-nhfb,
  articleno =    {26},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                  Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Hadjam:2014:RED,
  author =       {Fatima Zohra Hadjam and Claudio Moraga},
  title =        {{RIMEP2}: Evolutionary Design of Reversible Digital
                  Circuits},
  journal =      j-JETC,
  volume =       {11},
  number =       {3},
  pages =        {27:1--27:??},
  month =        dec,
  year =         {2014},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2629534},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Wed Jan 7 15:40:14 MST 2015},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract =     {RIMEP (Reversible Improved Multi Expression
                  Programming), is a system that has been developed for
                  designing reversible digital circuits. This article
                  discloses a new version of RIMEP called ``RIMEP2''. The
                  goal was to evolve reversible circuits in a ``fanout
                  free'' search space. The major changes that RIMEP has
                  undergone, are made in the structure of the chromosome
                  and in the fitness calculation. Although the changes
                  seem to be minor, the impact is effective. The
                  execution time has been considerably decreased and
                  optimal competitive solutions were found for a set of
                  30 selected benchmarks, where a quantum cost reduction
                  up to 96.13\% was reached with an average of 42.17\%.},
  acknowledgement = ack-nhfb,
  articleno =    {27},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                  Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Houshmand:2014:DDH,
  author =       {Mahboobeh Houshmand and Morteza Saheb Zamani and Mehdi
                  Sedighi and Mona Arabzadeh},
  title =        {Decomposition of Diagonal {Hermitian} Quantum Gates
                  Using Multiple-Controlled {Pauli} {Z} Gates},
  journal =      j-JETC,
  volume =       {11},
  number =       {3},
  pages =        {28:1--28:??},
  month =        dec,
  year =         {2014},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2629526},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Wed Jan 7 15:40:14 MST 2015},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract =     {Quantum logic decomposition refers to decomposing a
                  given quantum gate to a set of physically implementable
                  gates. An approach has been presented to decompose
                  arbitrary diagonal quantum gates to a set of
                  multiplexed-rotation gates around z axis. In this
                  article, a special class of diagonal quantum gates,
                  namely diagonal Hermitian quantum gates, is considered
                  and a new perspective to the decomposition problem with
                  respect to decomposing these gates is presented. It is
                  first shown that these gates can be decomposed to a set
                  that solely consists of multiple-controlled Z gates.
                  Then a binary representation for the diagonal Hermitian
                  gates is introduced. It is shown that the binary
                  representations of multiple-controlled Z gates form a
                  basis for the vector space that is produced by the
                  binary representations of all diagonal Hermitian
                  quantum gates. Moreover, the problem of decomposing a
                  given diagonal Hermitian gate is mapped to the problem
                  of writing its binary representation in the specific
                  basis mentioned previously. Moreover, CZ gate is
                  suggested to be the two-qubit gate in the decomposition
                  library, instead of previously used CNOT gate.
                  Experimental results show that the proposed approach
                  can lead to circuits with lower costs in comparison
                  with the previous ones.},
  acknowledgement = ack-nhfb,
  articleno =    {28},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                  Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Li:2014:SAB,
  author =       {Zhiqiang Li and Hanwu Chen and Xiaoyu Song and Marek
                  Perkowski},
  title =        {A Synthesis Algorithm for $4$-Bit Reversible Logic
                  Circuits with Minimum Quantum Cost},
  journal =      j-JETC,
  volume =       {11},
  number =       {3},
  pages =        {29:1--29:??},
  month =        dec,
  year =         {2014},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/2629542},
  ISSN =         {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L =       {1550-4832},
  bibdate =      {Wed Jan 7 15:40:14 MST 2015},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract =     {This article presents an algorithm which can quickly
                  find the exact minimum solution to almost all of 4-bit
                  reversible functions. We assume minimization of quantum
                  cost (MQC). This algorithm is designed in the most
                  memory-efficient way, or it will quickly run out of
                  memory. Therefore, we construct the shortest coding of
                  permutations, the topological compression and flexible
                  data structures for the memory savings. First, hash
                  tables are used for all 8-gate 4-bit circuits with the
                  minimization of gate count (MGC) by using the GT
                  library (with NOT, CNOT, Toffoli and Toffoli-4 gates).
                  Second, we merge and split the hash tables, thus
                  generating a single longer hash table for
                  high-performance. Third, we synthesize these circuits
                  with MQC by using the GTP library (with GT, Peres, and
                  Inverted Peres gates) based on the hash table. Finally,
                  according to the comparison of the QC of circuits, the
                  algorithm can quickly converge for any 4-bit reversible
                  circuit with MQC. By synthesizing all benchmark
                  functions, in comparison with Szyprowski and Kerntopf
                  [2011], the running time and QC are reduced up to
                  99.95\% and 18.2\%, respectively.},
  acknowledgement = ack-nhfb,
  articleno =    {29},
  fjournal =     {ACM Journal on Emerging Technologies in Computing
                  Systems (JETC)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Sen:2014:RRC,
  author = {Bibhash Sen and Manojit Dutta and Samik Some and
    Biplab K. Sikdar},
  title = {Realizing Reversible Computing in {QCA} Framework
    Resulting in Efficient Design of Testable {ALU}},
  journal = j-JETC,
  volume = {11},
  number = {3},
  pages = {30:1--30:??},
  month = dec,
  year = {2014},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2629538},
  ISSN = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L = {1550-4832},
  bibdate = {Wed Jan 7 15:40:14 MST 2015},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract = {Reversible logic is emerging as a prospective logic
    design style for implementing ultra-low-power VLSI
    circuits. It promises low-power consuming circuits by
    nullifying the energy dissipation in irreversible
    logic. On the other hand, as a potential alternative to
    CMOS technology, Quantum-dot Cellular Automata (QCA)
    promises energy efficient digital design with high
    device density and high computing speed. The
    integration of reversible logic in QCA circuit is
    expected to be effective in addressing the issue of
    energy dissipation at nano scale regime. This work
    targets the design of reversible ALU (arithmetic logic
    unit) in QCA framework and proposes a new ``Reversible
    QCA'' (RQCA). The primary design focus is on optimizing
    the number of reversible gates, quantum cost and the
    garbage outputs that are the most important hindrances
    in realizing reversible logic. Besides optimization,
    the fault coverage capability of RQCA under
    missing/additional cell deposition defects is analysed.
    The scope of reversible logic is further outstretched
    by introducing a novel DFT (design for testability)
    architecture around the reversible ALU that reduces
    testing overhead. The performance of proposed ALU is
    evaluated, subjected to different faults, and is
    established to be more effective than the existing
    ALU.},
  acknowledgement = ack-nhfb,
  articleno = {30},
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Rahman:2014:AQT,
  author = {Md. Mazder Rahman and Gerhard W. Dueck and Joseph D.
    Horton},
  title = {An Algorithm for Quantum Template Matching},
  journal = j-JETC,
  volume = {11},
  number = {3},
  pages = {31:1--31:??},
  month = dec,
  year = {2014},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2629537},
  ISSN = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L = {1550-4832},
  bibdate = {Wed Jan 7 15:40:14 MST 2015},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract = {Quantum circuits are often generated by decomposing
    gates from networks with classical reversible gates.
    Only in rare cases, the results are minimal.
    Post-optimization methods, such as template matching,
    are employed to reduce the quantum costs of circuits.
    Quantum templates are derived from identity circuits.
    All minimal realizations, within certain limitations,
    can be embedded into templates. Due to this property,
    templates matching has the potential to reduce quantum
    costs of circuits. However, one of the difficulties in
    finding templates matches is due to the mobility of the
    gates within the circuit. Thus far, template matching
    procedures have employed heuristics to reduce the
    search space. This article presents an in-depth study
    of exact template matching with a set of algorithms. A
    graph structure with the corresponding circuits
    facilitates the discovery of potential sequences of
    templates to be matched, and how exact minimization of
    circuits can be accomplished. The significance of the
    proposed method is verified in benchmarks
    optimization.},
  acknowledgement = ack-nhfb,
  articleno = {31},
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Hammerstrom:2015:ISI,
  author = {Dan Hammerstrom and Vijaykrishnan Narayanan},
  title = {Introduction to Special Issue on Neuromorphic
    Computing},
  journal = j-JETC,
  volume = {11},
  number = {4},
  pages = {32:1--32:??},
  month = apr,
  year = {2015},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2728709},
  ISSN = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L = {1550-4832},
  bibdate = {Tue Apr 28 05:59:37 MDT 2015},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  articleno = {32},
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Rodriguez:2015:TSS,
  author = {Laurent Rodriguez and Beno{\^\i}t Miramond and
    Bertrand Granado},
  title = {Toward a Sparse Self-Organizing Map for Neuromorphic
    Architectures},
  journal = j-JETC,
  volume = {11},
  number = {4},
  pages = {33:1--33:??},
  month = apr,
  year = {2015},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2638559},
  ISSN = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L = {1550-4832},
  bibdate = {Tue Apr 28 05:59:37 MDT 2015},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract = {Neurobiological systems have often been a source of
    inspiration for computational science and engineering,
    but in the past their impact has also been limited by
    the understanding of biological models. Today, new
    technologies lead to an equilibrium situation where
    powerful and complex computers bring new biological
    knowledge of the brain behavior. At this point, we
    possess sufficient understanding to both imagine new
    brain-inspired computing paradigms and to sustain a
    classical paradigm which reaches its end programming
    and intellectual limitations. In this context, we
    propose to reconsider the computation problem first in
    the specific domain of mobile robotics. Our main
    proposal consists in considering computation as part of
    a global adaptive system, composed of sensors,
    actuators, a source of energy and a controlling unit.
    During the adaptation process, the proposed
    brain-inspired computing structure does not only
    execute the tasks of the application but also reacts to
    the external stimulation and acts on the emergent
    behavior of the system. This approach is inspired by
    cortical plasticity in mammalian brains and suggests
    developing the computation architecture along the
    system's experience. This article proposes modeling
    this plasticity as a problem of estimating a
    probability density function. This function would
    correspond to the nature and the richness of the
    environment perceived through multiple modalities. We
    define and develop a novel neural model solving the
    problem in a distributed and sparse manner. And we
    integrate this neural map into a bio-inspired hardware
    substrate that brings the plasticity property into
    parallel many-core architectures. The approach is then
    called Hardware Plasticity. The results show that the
    self-organization properties of our model solve the
    problem of multimodal sensory data clusterization. The
    properties of the proposed model allow envisaging the
    deployment of this adaptation layer into hardware
    architectures embedded into the robot's body in order
    to build intelligent controllers.},
  acknowledgement = ack-nhfb,
  articleno = {33},
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Chabi:2015:CUS,
  author = {Djaafar Chabi and Weisheng Zhao and Damien Querlioz
    and Jacques-Olivier Klein},
  title = {On-Chip Universal Supervised Learning Methods for
    Neuro-Inspired Block of Memristive Nanodevices},
  journal = j-JETC,
  volume = {11},
  number = {4},
  pages = {34:1--34:??},
  month = apr,
  year = {2015},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2629503},
  ISSN = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L = {1550-4832},
  bibdate = {Tue Apr 28 05:59:37 MDT 2015},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract = {Scaling down beyond CMOS transistors requires the
    combination of new computing paradigms and novel
    devices. In this context, neuromorphic architecture is
    developed to achieve robust and ultra-low power
    computing systems. Memristive nanodevices are often
    associated with this architecture to implement
    efficiently synapses for ultra-high density. In this
    article, we investigate the design of a neuro-inspired
    logic block (NLB) dedicated to on-chip function
    learning and propose learning strategy. It is composed
    of an array of memristive nanodevices as synapses
    associated to neuronal circuits. Supervised learning
    methods are proposed for different type of memristive
    nanodevices and simulations are performed to
    demonstrate the ability to learn logic functions with
    memristive nanodevices. Benefiting from a compact
    implementation of neuron circuits and the optimization
    of learning process, this architecture requires small
    number of nanodevices and moderate power consumption.},
  acknowledgement = ack-nhfb,
  articleno = {34},
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Coussy:2015:FBN,
  author = {Philippe Coussy and Cyrille Chavet and Hugues Nono
    Wouafo and Laura Conde-Canencia},
  title = {Fully Binary Neural Network Model and Optimized
    Hardware Architectures for Associative Memories},
  journal = j-JETC,
  volume = {11},
  number = {4},
  pages = {35:1--35:??},
  month = apr,
  year = {2015},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2629510},
  ISSN = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L = {1550-4832},
  bibdate = {Tue Apr 28 05:59:37 MDT 2015},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract = {Brain processes information through a complex
    hierarchical associative memory organization that is
    distributed across a complex neural network. The GBNN
    associative memory model has recently been proposed as
    a new class of recurrent clustered neural network that
    presents higher efficiency than the classical models.
    In this article, we propose computational
    simplifications and architectural optimizations of the
    original GBNN. This work leads to significant
    complexity and area reduction without affecting neither
    memorizing nor retrieving performance. The obtained
    results open new perspectives in the design of
    neuromorphic hardware to support large-scale
    general-purpose neural algorithms.},
  acknowledgement = ack-nhfb,
  articleno = {35},
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Krichmar:2015:LSS,
  author = {Jeffrey L. Krichmar and Philippe Coussy and Nikil
    Dutt},
  title = {Large-Scale Spiking Neural Networks using Neuromorphic
    Hardware Compatible Models},
  journal = j-JETC,
  volume = {11},
  number = {4},
  pages = {36:1--36:??},
  month = apr,
  year = {2015},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2629509},
  ISSN = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L = {1550-4832},
  bibdate = {Tue Apr 28 05:59:37 MDT 2015},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract = {Neuromorphic engineering is a fast growing field with
    great potential in both understanding the function of
    the brain, and constructing practical artifacts that
    build upon this understanding. For these novel chips
    and hardware to be useful, hardware compatible
    applications and simulation tools are needed. We argue
    that the neural circuit approach, in which networks of
    neuronal elements model brain circuitry are
    constructed, allows the development of practical
    applications and the exploration of brain function. At
    this level of abstraction, networks of 10$^5$ neurons
    or larger can be efficiently simulated, but still
    preserve the neuronal and synaptic dynamics that appear
    to be important for brain function. Because the neural
    circuit level supports spiking neural networks and the
    prevalent Addressable Event Representation (AER)
    communication scheme, it fits well with many existing
    neuromorphic hardware and simulation tools. To show how
    this approach can be applied, we present case studies
    of spiking neural networks in vision and recognition
    tasks based on one instantiation of a simulation
    environment. However, there are now many hardware
    options, simulation environments, and applications in
    this emerging field. These approaches and other
    considerations are discussed.},
  acknowledgement = ack-nhfb,
  articleno = {36},
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{ChappetDeVangel:2015:RSD,
author = "Beno{\^\i}t {Chappet De Vangel} and Cesar
Torres-Huitzil and Bernard Girau",
title = "Randomly Spiking Dynamic Neural Fields",
journal = j-JETC,
volume = "11",
number = "4",
pages = "37:1--37:??",
month = apr,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629517",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Tue Apr 28 05:59:37 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Bio-inspired neural computation attracts a lot of
attention as a possible solution for the future
challenges in designing computational resources.
Dynamic neural fields (DNF) provide cortically inspired
models of neural populations to which computation can
be applied for a wide variety of tasks, such as
perception and sensorimotor control. DNFs are often
derived from continuous neural field theory (CNFT). In
spite of the parallel structure and regularity of CNFT
models, few studies of hardware implementations have
been carried out targeting embedded real-time
processing. In this article, a hardware-friendly model
adapted from the CNFT is introduced, namely the RSDNF
model (randomly spiking dynamic neural fields). Thanks
to their simplified 2D structure, RSDNFs achieve
scalable parallel implementations on digital hardware
while maintaining the behavioral properties of CNFT
models. Spike-based computations within neurons in the
field are introduced to reduce interneuron connection
bandwidth. Additionally, local stochastic spike
propagation ensures inhibition and excitation broadcast
without a fully connected network. The behavioral
soundness and robustness of the model in the presence
of noise and distracters is fully validated through
software and hardware. A field programmable gate array
(FPGA) implementation shows how the RSDNF model ensures
a level of density and scalability out of reach for
previous hardware implementations of dynamic neural
field models.",
acknowledgement = ack-nhfb,
articleno = "37",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Kim:2015:RDN,
  author = {Yongtae Kim and Yong Zhang and Peng Li},
  title = {A Reconfigurable Digital Neuromorphic Processor with
    Memristive Synaptic Crossbar for Cognitive Computing},
  journal = j-JETC,
  volume = {11},
  number = {4},
  pages = {38:1--38:??},
  month = apr,
  year = {2015},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2700234},
  ISSN = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L = {1550-4832},
  bibdate = {Tue Apr 28 05:59:37 MDT 2015},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract = {This article presents a brain-inspired reconfigurable
    digital neuromorphic processor (DNP) architecture for
    large-scale spiking neural networks. The proposed
    architecture integrates an arbitrary number of N
    digital leaky integrate-and-fire (LIF) silicon neurons
    to mimic their biological counterparts and on-chip
    learning circuits to realize spike-timing-dependent
    plasticity (STDP) learning rules. We leverage memristor
    nanodevices to build an N $ \times $ N crossbar array
    to store not only multibit synaptic weight values but
    also network configuration data with significantly
    reduced area overhead. Additionally, the crossbar array
    is designed to be accessible both column- and row-wise
    to expedite the synaptic weight update process for
    learning. The proposed digital pulse width modulator
    (PWM) produces binary pulses with various durations for
    reading and writing the multilevel memristive crossbar.
    The proposed column based analog-to-digital conversion
    (ADC) scheme efficiently accumulates the presynaptic
    weights of each neuron and reduces silicon area
    overhead by using a shared arithmetic unit to process
    the LIF operations of all N neurons. With 256 silicon
    neurons, learning circuits and 64K synapses, the power
    dissipation and area of our DNP are 6.45 mW and 1.86
    mm$^2$, respectively, when implemented in a 90-nm CMOS
    technology. The functionality of the proposed DNP
    architecture is demonstrated by realizing an
    unsupervised-learning based character recognition
    system.},
  acknowledgement = ack-nhfb,
  articleno = {38},
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Daneshtalab:2015:SIE,
  author = {Masoud Daneshtalab and Farhad Mehdipour and Zhiyi Yu
    and Hannu Tenhunen},
  title = {Special Issue on Emerging Many-Core Systems for
    Exascale Computing},
  journal = j-JETC,
  volume = {11},
  number = {4},
  pages = {39:1--39:??},
  month = apr,
  year = {2015},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2717312},
  ISSN = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L = {1550-4832},
  bibdate = {Tue Apr 28 05:59:37 MDT 2015},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  acknowledgement = ack-nhfb,
  articleno = {39},
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Jafri:2015:AID,
author = "Syed M. A. H. Jafri and Ozan Ozbag and Nasim Farahini
and Kolin Paul and Ahmed Hemani and Juha Plosila and
Hannu Tenhunen",
title = "Architecture and Implementation of Dynamic
Parallelism, Voltage and Frequency Scaling {(PVFS)} on
{CGRAs}",
journal = j-JETC,
volume = "11",
number = "4",
pages = "40:1--40:??",
month = apr,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2700250",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Tue Apr 28 05:59:37 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "In the era of platforms hosting multiple applications
with arbitrary performance requirements, providing a
worst-case platform-wide voltage/frequency operating
point is neither optimal nor desirable. As a solution
to this problem, designs commonly employ dynamic
voltage and frequency scaling (DVFS). DVFS promises
significant energy and power reductions by providing
each application with the operating point (and hence
the performance) tailored to its needs. To further
enhance the optimization potential, recent works
interleave dynamic parallelism with conventional DVFS.
The induced parallelism results in performance gains
that allow an application to lower its operating point
even further (thereby saving energy and power
consumption). However, the existing works employ costly
dedicated hardware (for synchronization) and rely
solely on greedy algorithms to make parallelism
decisions. To efficiently integrate parallelism with
DVFS, compared to state-of-the-art, we exploit the
reconfiguration (to reduce DVFS synchronization
overheads) and enhance the intelligence of the greedy
algorithm (to make optimal parallelism decisions).
Specifically, our solution relies on dynamically
reconfigurable isolation cells and an autonomous
parallelism, voltage, and frequency selection
algorithm. The dynamically reconfigurable isolation
cells reduce the area overheads of DVFS circuitry by
configuring the existing resources to provide
synchronization. The autonomous parallelism, voltage,
and frequency selection algorithm ensures high power
efficiency by combining parallelism with DVFS. It
selects that parallelism, voltage, and frequency trio
which consumes minimum power to meet the deadlines on
available resources. Synthesis and simulation results
using various applications/algorithms (WLAN, MPEG4,
FFT, FIR, matrix multiplication) show that our solution
promises significant reduction in area and power
consumption (23\% and 51\%) compared to
state-of-the-art.",
acknowledgement = ack-nhfb,
articleno = "40",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Olorode:2015:IPS,
  author = {Oluleye Olorode and Mehrdad Nourani},
  title = {Improving Performance in Sub-Block Caches with
    Optimized Replacement Policies},
  journal = j-JETC,
  volume = {11},
  number = {4},
  pages = {41:1--41:??},
  month = apr,
  year = {2015},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2668127},
  ISSN = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L = {1550-4832},
  bibdate = {Tue Apr 28 05:59:37 MDT 2015},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract = {Recent advances in computer processor design have led
    to the introduction of sub-blocking to cache
    architectures. Sub-block caches reduce the tag area and
    power overhead in caches without reducing the effective
    cache size by using fewer tags to index the full data
    RAM array. In spite of achieving reduced area and power
    overhead, sub-block caches suffer performance
    degradation due to cache trashing. This occurs when a
    wider cache line (super-block), made up of multiple
    valid cache lines (sub-blocks), is replaced or evicted
    when only a sub-block is to be allocated into the wider
    super-block. To address this problem, we propose cache
    replacement policies as they relate specifically to
    sub-block caches. We propose new replacement policies
    that are tuned for sub-block caches by adding more
    intelligence based on the valid state of individual
    sub-blocks of a super-block. We also investigate the
    effect of using a few level-0 registers to bypass a few
    level-1 cache pipe stages on sub-block cache
    performance. To evaluate the performance improvement
    offered by our proposed replacement policies and the
    use of level-0 registers, we developed a sub-block
    cache simulator based on the Simplescalar toolset for
    single-core evaluations and the Sniper Simulator for
    multicore evaluations. We show that, with minimal
    architectural updates to existing conventional cache
    replacement policies, we are able to improve level-1
    cache hit rates by up to 4.17\% using our proposed
    policies alone on SPEC2006 benchmarks and up to 14\% in
    shared level-2 caches using multicore benchmark suites:
    PARSEC and SPLASH2.},
  acknowledgement = ack-nhfb,
  articleno = {41},
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Li:2015:ICI,
  author = {Zhongqi Li and Nilanjan Goswami and Tao Li},
  title = {{iConn}: a Communication Infrastructure for
    Heterogeneous Computing Architectures},
  journal = j-JETC,
  volume = {11},
  number = {4},
  pages = {42:1--42:??},
  month = apr,
  year = {2015},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2700238},
  ISSN = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L = {1550-4832},
  bibdate = {Tue Apr 28 05:59:37 MDT 2015},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract = {Recently, the graphics processing unit (GPU) has made
    significant progress as a general-purpose parallel
    processor. The CPU and GPU cooperate together to solve
    data-parallel and control-intensive real-world
    applications in an optimized fashion. For example,
    emerging heterogeneous computing architectures such as
    Intel Sandy Bridge and AMD Fusion integrate the
    functionality of the CPU and GPU in a single die.
    However, the single-die CPU-GPU heterogeneous computing
    architecture faces the challenge of tight budget of die
    area. The conventional homogeneous interconnect fails
    to provide satisfactory performance by fully exploiting
    the given area budget in the heterogeneous processing
    era. In this article, we aim to implement an
    interconnect network within an area budget for a
    CPU-GPU heterogeneous computing architecture. We
    propose iConn, a 2D mesh-style on-chip heterogeneous
    communication infrastructure. In iConn, a set of GPU
    logical units such as the stream processors, the
    texture units, and the rendering output units form a
    computing unit (CU). Differing from conventional
    homogeneous router design, iConn adopts nonuniform
    on-chip routers in order to meet the unique
    communication demands from each single CPU and CU. The
    routers can also dynamically allocate their buffers
    across all virtual channels (VCs) to meet the latency
    requirements of CPUs and CUs. Moreover, the memory
    controller scheduling algorithm is modified from
    traditional load-over-store scheduling in order to
    prioritize the traffic. Our simulation results show
    that iConn improves the performance of CPUs by 23.0\%
    and CUs by 9.4\%.},
  acknowledgement = ack-nhfb,
  articleno = {42},
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Khayambashi:2015:ARA,
author = "Misagh Khayambashi and Pooria M. Yaghini and Ashkan
Eghbal and Nader Bagherzadeh",
title = "Analytical Reliability Analysis of {$3$D} {NoC} under
{TSV} Failure",
journal = j-JETC,
volume = "11",
number = "4",
pages = "43:1--43:??",
month = apr,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2700236",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Tue Apr 28 05:59:37 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "The network-on-chip (NoC) technology allows for
integration of a manycore design on a single chip for
higher efficiency and scalability. Three-dimensional
(3D) NoCs offer several advantages over two-dimensional
(2D) NoCs. Through-silicon via (TSV) technology is one
of the candidates for implementation of 3D NoCs. TSV
reliability analysis is still challenging for 3D NoC
designers because of their unique electrical, thermal,
and physical characteristics. After providing an
overview of common TSV issues, this article aims to
define a reliability criterion for NoC and provide a
framework for quantifying this reliability as it
relates to TSV issues. TSV issues are modeled as a
time-invariant failure probability. Also, a reliability
criterion for TSV-based NoC is defined. The
relationship between NoC reliability and TSV failure is
quantified. For the first time, the reliability
criterion is reduced to a tractable closed-form
expression that requires a single Monte Carlo
simulation. Importantly, the Monte Carlo simulation
depends only on network geometry. To demonstrate our
proposed method, the reliability criterion of a simple
$ 8 \times 8 \times 8 $ NoC supported by an $ 8 \times
8 \times 7 $ network of TSVs is calculated.",
acknowledgement = ack-nhfb,
articleno = "43",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Pang:2015:MLN,
  author = {Jun Pang and Christopher Dwyer and Alvin R. Lebeck},
  title = {{mNoC}: Large Nanophotonic Network-on-Chip Crossbars
    with Molecular Scale Devices},
  journal = j-JETC,
  volume = {12},
  number = {1},
  pages = {1:1--1:??},
  month = jul,
  year = {2015},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2700241},
  ISSN = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L = {1550-4832},
  bibdate = {Tue Aug 4 07:26:23 MDT 2015},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract = {Moore's law and the continuity of device scaling have
    led to an increasing number of cores/nodes on a chip,
    creating a need for new mechanisms to achieve
    high-performance and power-efficient Network-on-Chip
    (NoC). Nanophotonics based NoCs provide for higher
    bandwidth and more power efficient designs than
    electronic networks. Present approaches often use an
    external laser source, ring resonators, and waveguides.
    However, they still suffer from important limitations:
    large static power consumption, and limited network
    scalability. In this article, we explore the use of
    emerging molecular scale devices to construct
    nanophotonic networks: Molecular-scale Network-on-Chip
    (mNoC). We leverage on-chip emitters such as quantum
    dot LEDs, which provide electrical to optical signal
    modulation, and chromophores, which provide optical
    signal filtering for receivers. These devices replace
    the ring resonators and the external laser source used
    in contemporary nanophotonic NoCs. They reduce energy
    consumption or enable scaling to larger crossbars for a
    reduced energy budget. We present a Single Writer
    Multiple Reader (SWMR) bus based crossbar mNoC. Our
    evaluation shows that an mNoC can achieve more than
    88\% reduction in energy for a $ 64 \times 64 $
    crossbar compared to similar ring resonator based
    designs. Additionally, an mNoC can scale to a $ 256
    \times 256 $ crossbar with an average 10\% performance
    improvement and 54\% energy reduction.},
  acknowledgement = ack-nhfb,
  articleno = {1},
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Hossain:2015:MGN,
  author = {Nahid M. Hossain and Masud H. Chowdhury},
  title = {Multilayer Graphene Nanoribbon and Carbon Nanotube
    Based Floating Gate Transistor for Nonvolatile Flash
    Memory},
  journal = j-JETC,
  volume = {12},
  number = {1},
  pages = {2:1--2:??},
  month = jul,
  year = {2015},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2701428},
  ISSN = {1550-4832 (print), 1550-4840 (electronic)},
  ISSN-L = {1550-4832},
  bibdate = {Tue Aug 4 07:26:23 MDT 2015},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/jetc.bib},
  abstract = {Floating gate transistor is the fundamental building
    block of nonvolatile flash memory, which is one of the
    most widely used memory gadgets in modern micro and
    nano electronic applications. Recently there has been a
    surge of interest to introduce a new generation of
    memory devices using graphene nanotechnology. In this
    article, we present a new floating gate transistor
    (FGT) design based on multilayer graphene nanoribbon
    (MLGNR) and carbon nanotube (CNT). In the proposed FGT,
    a MLGNR structure would be used as the channel of the
    field effect transistor (FET) and a layer of CNTs would
    be used as the floating gate. We have performed an
    analysis of the programming and erasing mechanism in
    the floating gate and its dependence on the applied
    control gate voltages. Based on our analysis we have
    observed that proposed graphene based floating gate
    transistor could be operated at a low voltage compared
    to conventional silicon based floating gate devices. We
    have presented detail analysis of the operation and the
    programming and erasing processes of the proposed FGT;
    the dependency of the programming and erasing current
    density on different parameters; and the impact of
    scaling the thicknesses of the control and tunneling
    oxides. To perform these analyses we have developed
    equivalent models for device capacitances.},
  acknowledgement = ack-nhfb,
  articleno = {2},
  fjournal = {ACM Journal on Emerging Technologies in Computing
    Systems (JETC)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J967},
}
@Article{Ghofrani:2015:LPV,
  author =       "Amirali Ghofrani and Miguel-Angel Lastras-Monta{\~n}o
                 and Siddharth Gaba and Melika Payvand and Wei Lu and
                 Luke Theogarajan and Kwang-Ting Cheng",
  title =        "A Low-Power Variation-Aware Adaptive Write Scheme for
                 Access-Transistor-Free Memristive Memory",
  journal =      j-JETC,
  volume =       "12",
  number =       "1",
  pages =        "3:1--3:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2717313",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Aug 4 07:26:23 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Recent advances in access-transistor-free memristive
                 crossbars have demonstrated the potential of memristor
                 arrays as high-density and ultra-low-power memory.
                 However, with considerable variations in the write-time
                 characteristics of individual memristors, conventional
                 fixed-pulse write schemes cannot guarantee reliable
                 completion of the write operations and waste
                 significant amount of energy. We propose an adaptive
                 write scheme that adaptively adjusts the write pulses
                 to address such variations in memristive arrays,
                 resulting in $ 7 \times $--$ 11 \times $ average energy
                 saving in our case studies. Our scheme embeds an online
                 monitor to detect the completion of a write operation
                 and takes into account the parasitic effect of
                 line-shared devices in access-transistor-free
                 crossbars. This feature also helps shorten the test
                 time of memory march algorithms by eliminating the need
                 of a verifying read right after a write, which is
                 commonly employed in the test sequences of march
                 algorithms.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Venkatesan:2015:EEA,
  author =       "Rangharajan Venkatesan and Mrigank Sharad and Kaushik
                 Roy and Anand Raghunathan",
  title =        "Energy-Efficient All-Spin Cache Hierarchy Using
                 Shift-Based Writes and Multilevel Storage",
  journal =      j-JETC,
  volume =       "12",
  number =       "1",
  pages =        "4:1--4:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2723165",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Aug 4 07:26:23 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Spintronic memories are considered to be promising
                 candidates for future on-chip memories due to their
                 high density, nonvolatility, and near-zero leakage.
                 However, they also face challenges such as high write
                 energy and latency and limited read speed due to
                 single-ended sensing. Further, the conflicting
                 requirements of read and write operations lead to
                 stringent design constraints that severely compromises
                 their benefits. Recently, domain wall memory was
                 proposed as a spintronic memory that has a potential
                 for very high density by storing multiple bits in the
                 domains of a ferromagnetic nanowire. While reliable
                 operation of DWM memory with multiple domains faces
                 many challenges, single-bit cells that utilize domain
                 wall motion for writes have been experimentally
                 demonstrated [Fukami et al. 2009]. This bit-cell, which
                 we refer to as Domain Wall Memory with Shift-based
                 Write (DWM-SW), achieves improved write efficiency and
                 features decoupled read-write paths, enabling
                 independent optimizations of read and write operations.
                 However, these benefits are achieved at the cost of
                 sacrificing the original goal of improved density. In
                 this work, we explore multilevel storage as a new
                 direction to enhance the density benefits of DWM-SW. At
                 the device level, we propose a new device---multilevel
                 DWM with shift-based write (ML-DWM-SW)---that is capable
                 of storing 2 bits in a single device. At the circuit
                 level, we propose a ML-DWM-SW based bit-cell design and
                 layout. The ML-DWM-SW bit-cell incurs no additional
                 area overhead compared to the DWM-SW bit-cell despite
                 storing an additional bit, thereby achieving roughly
                 twice the density. However, it requires a two-step
                 write operation and has data-dependent read and write
                 energies, which pose unique challenges. To address
                 these issues, we propose suitable architectural
                 optimizations: (i) intra-word interleaving and (ii) bit
                 encoding. We design ``all-spin'' cache architectures
                 using the proposed ML-DWM-SW bit-cell for both general
                 purpose processors as well as general purpose graphics
                 processing units (GPGPUs). We perform an iso-capacity
                 replacement of SRAM with spintronic memories and study
                 the energy and area benefits at iso-performance
                 conditions. For general purpose processors, the
                 ML-DWM-SW cache achieves 10X reduction in energy and
                 4.4X reduction in cache area compared to an SRAM cache
                 and 2X and 1.7X reduction in energy and area,
                 respectively, compared to an STT-MRAM cache. For
                 GPGPUs, the ML-DWM-SW cache achieves 5.3X reduction in
                 energy and 3.6X area reduction compared to SRAM and
                 3.5X energy reduction and 1.9X area reduction compared
                 to STT-MRAM.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Park:2015:MME,
  author =       "Kyu Ho Park and Woomin Hwang and Hyunchul Seok and
                 Chulmin Kim and Dong-jae Shin and Dong Jin Kim and Min
                 Kyu Maeng and Seong Min Kim",
  title =        "{MN-MATE}: Elastic Resource Management of Manycores
                 and a Hybrid Memory Hierarchy for a Cloud Node",
  journal =      j-JETC,
  volume =       "12",
  number =       "1",
  pages =        "5:1--5:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2701429",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Aug 4 07:26:23 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Recent advent of manycore system increases needs for
                 larger but faster memory hierarchy. Emerging next
                 generation memories such as on-chip DRAM and
                 nonvolatile memory (NVRAM) are promising candidates for
                 replacement of DRAM-only main memory. Combined with the
                 manycore trends, it gives an opportunity to rethink
                 conventional resource management system with a memory
                 hierarchy for a single cloud node. In an attempt to
                 mitigate the energy and memory problems, we propose
                 MN-MATE, an elastic resource management architecture
                 for a single cloud node with manycores, on-chip DRAM,
                 and large size of off-chip DRAM and NVRAM. In MN-MATE,
                 the hypervisor places consolidated VMs and balances
                 memory among them. Based on the monitored information
                 about the allocated memory, a guest OS co-schedules
                 tasks accessing different types of memory with
                 complementary access intensity. Polymorphic management
                 of DRAM hierarchy accelerates average memory access
                 speed inside each guest OS. A guest OS reduces energy
                 consumption with small performance loss based on the
                 NVRAM-aware data placement policy and the hybrid page
                 cache. A new lightweight kernel is developed to reduce
                 the overhead from the guest OS for scientific
                 applications. Experiment results show that our
                 techniques in MN-MATE platform improve system
                 performance and reduce energy consumption.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Wang:2015:WAS,
  author =       "Jue Wang and Yuan Xie",
  title =        "A Write-Aware {STTRAM}-Based Register File
                 Architecture for {GPGPU}",
  journal =      j-JETC,
  volume =       "12",
  number =       "1",
  pages =        "6:1--6:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700230",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Aug 4 07:26:23 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "The massively parallel processing capacity of GPGPUs
                 requires a large register file (RF), and its size keeps
                 increasing to support more concurrent threads from
                 generation to generation. Using traditional SRAM-based
                 RFs, there are concerns in both area cost and energy
                 consumption, and soon they will become unrealistic. In
                 this work, we analyze the feasibility of using
                 STTRAM-based RF designs, which have benefits in terms
                 of smaller silicon area and zero standby leakage power.
                 However, STTRAM long write latency and high write
                 energy bring new challenges. Therefore, we propose a
                 write-aware STTRAM-based RF architecture (WarRF), which
                 contains two techniques: Split Bank Write modifies the
                 arbitrator design to increase the parallelism of read
                 and write accesses in the same bank; Write Pool reduces
                 the number of repeated write accesses to RFs. Our
                 experiment shows that the performance of STTRAM-based
                 RF is improved by 13\% and up to 23\% after adopting
                 WarRF. In addition, the energy consumption is reduced
                 by 38\% on average compared to SRAM-based RFs.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
%%% The unit in the title is braced as {$ \mu $A} so that bibliography
%%% styles that downcase titles cannot turn the ampere symbol ``A''
%%% into ``a'' or separate it from its prefix.
@Article{Romani:2015:SSC,
  author =       "Aldo Romani and Matteo Filippi and Michele Dini and
                 Marco Tartagni",
  title =        "A Sub-{$ \mu $A} Stand-By Current Synchronous Electric
                 Charge Extractor for Piezoelectric Energy Harvesting",
  journal =      j-JETC,
  volume =       "12",
  number =       "1",
  pages =        "7:1--7:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700244",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Aug 4 07:26:23 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "In the field of energy harvesting there is a growing
                 interest in power management circuits with intrinsic
                 sub-$ \mu $A current consumptions, in order to operate
                 efficiently with very low levels of available power. In
                 this context, integrated circuits proved to be a viable
                 solution with high associated nonrecurring costs and
                 design risks. As an alternative, this article presents
                 a fully autonomous and battery-less circuit solution
                 for piezoelectric energy harvesting based on discrete
                 components in a low-cost PCB technology, which achieves
                 a comparable performance in a $ 32 \times 43 $ mm$^2$
                 footprint. The power management circuit implements
                 synchronous electric charge extraction (SECE) with a
                 passive bootstrap circuit from fully discharged states.
                 Circuit characterization showed that the circuit
                 consumes less than 1 $ \mu $A with a 3V output and may
                 achieve energy conversion efficiencies of up to 85\%.
                 In addition, the circuit is specifically designed for
                 operating with input and output voltages up to 20V,
                 which grants a significant flexibility in the choice of
                 transducers and energy storage capacitors.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Jayakumar:2015:QHS,
  author =       "Hrishikesh Jayakumar and Arnab Raha and Woo Suk Lee
                 and Vijay Raghunathan",
  title =        "{QuickRecall}: a {HW\slash SW} Approach for Computing
                 across Power Cycles in Transiently Powered Computers",
  journal =      j-JETC,
  volume =       "12",
  number =       "1",
  pages =        "8:1--8:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700249",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Aug 4 07:26:23 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Transiently Powered Computers (TPCs) are a new class
                 of batteryless embedded systems that depend solely on
                 energy harvested from external sources for performing
                 computations. Enabling long-running computations on
                 TPCs is a major challenge due to the highly
                 intermittent nature of the power supply (often bursts
                 of {$<$} 100ms), resulting in frequent system reboots.
                 Prior work seeks to address this issue by frequently
                 checkpointing system state in flash memory, preserving
                 it across power cycles. However, this involves a
                 substantial overhead due to the high erase/write times
                 of flash memory. This article proposes the use of
                 Ferroelectric RAM (FRAM), an emerging nonvolatile
                 memory technology that combines the benefits of SRAM
                 and flash, to seamlessly enable long-running
                 computations in TPCs. We propose a lightweight, in-situ
                 checkpointing technique for TPCs using FRAM that
                 consumes only 30 nJ while decreasing the time taken for
                 saving and restoring a checkpoint to only 21.06 $ \mu $s,
                 which is over two orders of magnitude lower than the
                 corresponding overhead using flash. We have implemented
                 and evaluated our technique, QuickRecall, using the TI
                 MSP430FR5739 FRAM-enabled microcontroller. Experimental
                 results show that our highly-efficient checkpointing
                 translate to significant speedup ($ 1.25 \times $--$
                 8.4 \times $) in program execution time and reduction
                 ($ \approx 3 \times $) in application-level energy
                 consumption.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Chien:2015:FTO,
  author =       "Chia-Hung Chien and Rodney {Van Meter} and Sy-Yen
                 Kuo",
  title =        "Fault-Tolerant Operations for Universal Blind Quantum
                 Computation",
  journal =      j-JETC,
  volume =       "12",
  number =       "1",
  pages =        "9:1--9:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700248",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Aug 4 07:26:23 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Blind quantum computation is an appealing use of
                 quantum information technology because it can conceal
                 both the client's data and the algorithm itself from
                 the server. However, problems need to be solved in the
                 practical use of blind quantum computation and
                 fault-tolerance is a major challenge. Broadbent et al.
                 proposed running error correction over blind quantum
                 computation, and Morimae and Fujii proposed using
                 fault-tolerant entangled qubits as the resource for
                 blind quantum computation. Both approaches impose
                 severe demands on the teleportation channel, the former
                 requiring unrealistic data rates and the latter
                 near-perfect fidelity. To extend the application range
                 of blind quantum computation, we suggest that Alice
                 send input qubits encoded with error correction code
                 instead of single input qubits. Two fault-tolerant
                 protocols are presented and we showed the trade-off of
                 the computational overhead using the ten-bit quantum
                 carry-lookahead adder as an example. Though these two
                 fault-tolerant protocols require the client to have
                 more quantum computing ability than using approaches
                 from prior work, they provide better fault-tolerance
                 when the client and the server are connected by
                 realistic quantum repeater networks.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Cheng:2015:SSC,
  author =       "Ching-Hwa Cheng",
  title =        "{SCKVdd}: a Scalable Clock-Controlled Self-Stabilized
                 Voltage Technique for Low Power {CMOS} Digital
                 Circuits",
  journal =      j-JETC,
  volume =       "12",
  number =       "1",
  pages =        "10:1--10:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2790754",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Aug 4 07:26:23 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "It has been proposed that small amounts of energy
                 dissipate when transfer through a rising Vdd. In
                 typical power gate circuits, the PMOS transistors
                 (P$_{SW}$) reduce the leakage of power by shutting off
                 outer Vdd to the idle blocks. We expand this technique
                 by utilizing active P$_{SW}$, which are turned on and
                 off by a clock signal. The proposed SCKVdd technique
                 combines the power source gated mechanism and clock
                 signal to generate stable progressive rising voltage to
                 suppress peak and average currents effectively. The
                 SCKVdd technique is a scalable, clock-controlled,
                 self-stabilized voltage technique. This technique is
                 easily implemented in generic digital circuits to
                 reduce power dissipation. A normal CMOS circuit shows a
                 dynamic power consumption increase proportional to the
                 clock frequency. SCKVdd results in a lower-than-usual
                 frequency dependency, and is suitable for high speed
                 clock circuits. SCKVdd can be integrated with
                 frequency, voltage scaling and an activated P$_{SW}$
                 number to implement an efficient power-performance
                 trade-off mechanism. In experiments that investigated
                 constant Vdd for MPEG VLD chips, power dissipation
                 savings were in the range of 42\% to 54\% with only a
                 small delay penalty.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Todri-Sanial:2015:GES,
  author =       "Aida Todri-Sanial and Sanjukta Bhanja",
  title =        "Guest Editorial: Special Issue on Advances in Design
                 of Ultra-Low Power Circuits and Systems in Emerging
                 Technologies",
  journal =      j-JETC,
  volume =       "12",
  number =       "2",
  pages =        "11:1--11:??",
  month =        aug,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2756554",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Sep 8 18:25:16 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Gaillardon:2015:SLP,
  author =       "Pierre-Emmanuel Gaillardon and Edith Beigne and
                 Suzanne Lesecq and Giovanni {De Micheli}",
  title =        "A Survey on Low-Power Techniques with Emerging
                 Technologies: From Devices to Systems",
  journal =      j-JETC,
  volume =       "12",
  number =       "2",
  pages =        "12:1--12:??",
  month =        aug,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2714566",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Sep 8 18:25:16 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Nowadays, power consumption is one of the main
                 limitations of electronic systems. In this context,
                 novel and emerging devices provide new opportunities to
                 extend the trend toward low-power design. In this
                 survey article, we present a transversal survey on
                 energy-efficient techniques ranging from devices to
                 architectures. The actual trends of device research,
                 with fully depleted planar devices, tri-gate
                 geometries, and gate-all-around structures, allows us
                 to reach an increasingly higher level of performance
                 while reducing the associated power. In addition,
                 beyond the simple device property enhancements,
                 emerging devices also lead to innovations at the
                 circuit and architectural levels. In particular,
                 devices whose properties can be tuned through
                 additional terminals enable a fine and dynamic control
                 of device threshold. They also enable designers to
                 realize logic gates and to implement power-related
                 techniques in a compact way unreachable to standard
                 technologies. These innovations reduce power
                 consumption at the gate level and unlock new means of
                 actuation in architectural solutions like adaptive
                 voltage and frequency scaling.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Sitik:2015:FBL,
  author =       "Can Sitik and Emre Salman and Leo Filippini and Sung
                 Jun Yoon and Baris Taskin",
  title =        "{FinFET}-Based Low-Swing Clocking",
  journal =      j-JETC,
  volume =       "12",
  number =       "2",
  pages =        "13:1--13:??",
  month =        aug,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2701617",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Sep 8 18:25:16 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "A low-swing clocking methodology is introduced to
                 achieve low-power operation at 20nm FinFET technology.
                 Low-swing clock trees are used in existing
                 methodologies in order to decrease the dynamic power
                 consumption in a trade-off for 3 issues: (1) the effect
                 of leakage power consumption, which is becoming more
                 dominant when the process scales sub-32nm; (2) the
                 increase in insertion delay, resulting in a high clock
                 skew; and (3) the difficulty in driving the existing
                 DFF sinks with a low-swing clock signal without a
                 timing violation. In this article, a FinFET-based
                 low-swing clocking methodology is introduced to
                 preserve the dynamic power savings of low-swing
                 clocking while minimizing these three negative effects,
                 facilitated through an efficient use of FinFET
                 technology. At scaled performance constraints, the
                 proposed methodology at 20nm FinFET leads to 42\% total
                 power savings (clock network+DFF) compared to a
                 FinFET-based full-swing counterpart at the same
                 frequency (3 GHz), thanks to the dynamic power savings
                 of low-swing clocking and 3\% power savings compared to
                 a CMOS-based low-swing implementation running at the
                 half frequency (1.5 GHz), thanks to the leakage power
                 savings of FinFET technology.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Zhang:2015:DCP,
  author =       "Tiansheng Zhang and Jie Meng and Ayse K. Coskun",
  title =        "Dynamic Cache Pooling in {$3$D} Multicore Processors",
  journal =      j-JETC,
  volume =       "12",
  number =       "2",
  pages =        "14:1--14:??",
  month =        aug,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700247",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Sep 8 18:25:16 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Resource pooling, where multiple architectural
                 components are shared among cores, is a promising
                 technique for improving system energy efficiency and
                 reducing total chip area. 3D stacked multicore
                 processors enable efficient pooling of cache resources
                 owing to the short interconnect latency between
                 vertically stacked layers. This article first
                 introduces a 3D multicore architecture that provides
                 poolable cache resources. We then propose a runtime
                 management policy to improve energy efficiency in 3D
                 systems by utilizing the flexible heterogeneity of
                 cache resources. Our policy dynamically allocates jobs
                 to cores on the 3D system while partitioning cache
                 resources based on cache hungriness of the jobs. We
                 investigate the impact of the proposed cache resource
                 pooling architecture and management policy in 3D
                 systems, both with and without on-chip DRAM. We
                 evaluate the performance, energy efficiency, and
                 thermal behavior for a wide range of workloads running
                 on 3D systems. Experimental results demonstrate that
                 the proposed architecture and policy reduce system
                 energy-delay product (EDP) and energy-delay-area
                 product (EDAP) by 18.8\% and 36.1\% on average,
                 respectively, in comparison to 3D processors with
                 static cache sizes.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Khasanvis:2015:LPH,
  author =       "Santosh Khasanvis and K. M. Masum Habib and Mostafizur
                 Rahman and Roger Lake and Csaba Andras Moritz",
  title =        "Low-Power Heterogeneous Graphene Nanoribbon-{CMOS}
                 Multistate Volatile Memory Circuit",
  journal =      j-JETC,
  volume =       "12",
  number =       "2",
  pages =        "15:1--15:??",
  month =        aug,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700233",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Sep 8 18:25:16 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Graphene is an emerging nanomaterial believed to be a
                 potential candidate for post-Si nanoelectronics due to
                 its exotic properties. Recently, a new graphene
                 nanoribbon crossbar (xGNR) device was proposed which
                 exhibits negative differential resistance (NDR). In
                 this article, a multistate memory design is presented
                 that can store multiple bits in a single cell enabled
                 by this xGNR device, called graphene nanoribbon
                 tunneling random access memory (GNTRAM). An approach to
                 increase the number of bits per cell is explored
                 alternative to physical scaling to overcome CMOS SRAM
                 limitations. A comprehensive design for quaternary
                 GNTRAM is presented as a baseline, implemented with a
                 heterogeneous integration between graphene and CMOS.
                 Sources of leakage and approaches to mitigate them are
                 investigated. This design is extensively benchmarked
                 against 16nm CMOS SRAMs and 3T DRAM. The proposed
                 quaternary cell shows up to 2.27$ \times $ density
                 benefit versus 16nm CMOS SRAMs and 1.8$ \times $ versus
                 3T DRAM. It has comparable read performance and is
                 power efficient up to 1.32$ \times $ during active
                 period and 818$ \times $ during standby against
                 high-performance SRAMs. Multistate GNTRAM has the
                 potential to realize high-density low-power nanoscale
                 embedded memories. Further improvements may be possible
                 by using graphene more extensively, as graphene
                 transistors become available in the future.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Kang:2015:SEU,
  author =       "Wang Kang and Yue Zhang and Zhaohao Wang and
                 Jacques-Olivier Klein and Claude Chappert and
                 Dafin{\'e} Ravelosona and Gefei Wang and Youguang Zhang
                 and Weisheng Zhao",
  title =        "Spintronics: Emerging Ultra-Low-Power Circuits and
                 Systems beyond {MOS} Technology",
  journal =      j-JETC,
  volume =       "12",
  number =       "2",
  pages =        "16:1--16:??",
  month =        aug,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2663351",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Sep 8 18:25:16 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Conventional MOS integrated circuits and systems
                 suffer severe power and scalability challenges as
                 technology nodes scale into ultra-deep-micron
                 technology nodes (e.g., below 40nm). Both static and
                 dynamic power dissipations are increasing, caused
                 mainly by the intrinsic leakage currents and large data
                 traffic. Alternative approaches beyond
                 charge-only-based electronics, and in particular,
                 spin-based devices, show promising potential to
                 overcome these issues by adding the spin freedom of
                 electrons to electronic circuits. Spintronics provides
                 data non-volatility, fast data access, and low-power
                 operation, and has now become a hot topic in both
                 academia and industry for achieving ultra-low-power
                 circuits and systems. The ITRS report on emerging
                 research devices identified the magnetic tunnel
                 junction (MTJ) nanopillar (one of the Spintronics
                 nanodevices) as one of the most promising technologies
                 to be part of future micro-electronic circuits. In this
                 review we will give an overview of the status and
                 prospects of spin-based devices and circuits that are
                 currently under intense investigation and development
                 across the world, and address particularly their merits
                 and challenges for practical applications. We will also
                 show that, with a rapid development of Spintronics,
                 some novel computing architectures and paradigms beyond
                 classic Von-Neumann architecture have recently been
                 emerging for next-generation ultra-low-power circuits
                 and systems.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Azghadi:2015:PST,
  author =       "Mostafa Rahimi Azghadi and Saber Moradi and Daniel B.
                 Fasnacht and Mehmet Sirin Ozdas and Giacomo Indiveri",
  title =        "Programmable Spike-Timing-Dependent Plasticity
                 Learning Circuits in Neuromorphic {VLSI}
                 Architectures",
  journal =      j-JETC,
  volume =       "12",
  number =       "2",
  pages =        "17:1--17:??",
  month =        aug,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2658998",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Sep 8 18:25:16 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Hardware implementations of spiking neural networks
                 offer promising solutions for computational tasks that
                 require compact and low-power computing technologies.
                 As these solutions depend on both the specific network
                 architecture and the type of learning algorithm used,
                 it is important to develop spiking neural network
                 devices that offer the possibility to reconfigure their
                 network topology and to implement different types of
                 learning mechanisms. Here we present a neuromorphic
                 multi-neuron VLSI device with on-chip programmable
                 event-based hybrid analog/digital circuits; the
                 event-based nature of the input/output signals allows
                 the use of address-event representation infrastructures
                 for configuring arbitrary network architectures, while
                 the programmable synaptic efficacy circuits allow the
                 implementation of different types of spike-based
                 learning mechanisms. The main contributions of this
                 article are to demonstrate how the programmable
                 neuromorphic system proposed can be configured to
                 implement specific spike-based synaptic plasticity
                 rules and to depict how it can be utilised in a
                 cognitive task. Specifically, we explore the
                 implementation of different spike-timing plasticity
                 learning rules online in a hybrid system comprising a
                 workstation and when the neuromorphic VLSI device is
                 interfaced to it, and we demonstrate how, after
                 training, the VLSI device can perform as a standalone
                 component (i.e., without requiring a computer), binary
                 classification of correlated patterns.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Graziano:2015:PVE,
  author =       "Mariagrazia Graziano and Azzurra Pulimeno and Ruiyu
                 Wang and Xiang Wei and Massimo Ruo Roch and Gianluca
                 Piccinini",
  title =        "Process Variability and Electrostatic Analysis of
                 Molecular {QCA}",
  journal =      j-JETC,
  volume =       "12",
  number =       "2",
  pages =        "18:1--18:??",
  month =        aug,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2738041",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Sep 8 18:25:16 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Molecular quantum-dot cellular automata (mQCA) is an
                 emerging paradigm for nanoscale computation. Its
                 revolutionary features are the expected operating
                 frequencies (THz), the high device densities, the
                 noncryogenic working temperature, and, above all, the
                 limited power densities. The main drawback of this
                 technology is a consequence of one of its very main
                 advantages, that is, the extremely small size of a
                 single molecule. Device prototyping and the fabrication
                 of a simple circuit are limited by lack of control in
                 the technological process [Pulimeno et al. 2013a].
                 Moreover, high defectivity might strongly impact the
                 correct behavior of mQCA devices. Another challenging
                 point is the lack of a solid method for analyzing and
                 simulating mQCA behavior and performance, either in
                 ideal or defective conditions. Our contribution in this
                 article is threefold: (i) We identify a methodology
                 based on both ab-initio simulations and post-processing
                 of data for analyzing an mQCA system adopting an
                 electronic point of view (we baptized this method as
                 ``MoSQuiTo''); (ii) we assess the performance of an
                 mQCA device (in this case, a bis-ferrocene molecule)
                 working in nonideal conditions, using as a reference
                 the information on fabrication-critical issues and on
                 the possible defects that we are obtaining while
                 conducting our own ongoing experiments on mQCA; (iii)
                 we determine and assess the electrostatic energy stored
                 in a bis-ferrocene molecule both in an oxidized and
                 reduced form. Results presented here consist of
                 quantitative information for an mQCA device working in
                 manifold driving conditions and subjected to defects.
                 This information is given in terms of: (a) output
                 voltage; (b) safe operating area (SOA); (c)
                 electrostatic energy; and (d) relation between SOA and
                 energy, that is, possible energy reduction subject to
                 reliability and functionality constraints. The whole
                 analysis is a first fundamental step toward the study
                 of a complex mQCA circuit. It gives important
                 suggestions on possible improvements of the
                 technological processes. Moreover, it starts an
                 interesting assessment on the energy of an mQCA, one of
                 the most promising features of this technology.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Le:2015:END,
author = "Trong Nhan Le and Alain Pegatoquet and Olivier Berder
and Olivier Sentieys and Arnaud Carer",
title = "Energy-Neutral Design Framework for
Supercapacitor-Based Autonomous Wireless Sensor
Networks",
journal = j-JETC,
volume = "12",
number = "2",
pages = "19:1--19:??",
month = aug,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2787512",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Tue Sep 8 18:25:16 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "To design autonomous wireless sensor networks (WSNs)
with a theoretical infinite lifetime, energy harvesting
(EH) techniques have been recently considered as
promising approaches. Ambient sources can provide
everlasting additional energy for WSN nodes and exclude
their dependence on battery. In this article, an
efficient energy harvesting system which is compatible
with various environmental sources, such as light,
heat, or wind energy, is proposed. Our platform takes
advantage of double-level capacitors not only to
prolong system lifetime but also to enable robust
booting from the exhausting energy of the system.
Simulations and experiments show that our
multiple-energy-sources converter (MESC) can achieve
booting time in order of seconds. Although capacitors
have virtual recharge cycles, they suffer higher
leakage compared to rechargeable batteries. Increasing
their size can decrease the system performance due to
leakage energy. Therefore, an energy-neutral design
framework providing a methodology to determine the
minimum size of those storage devices satisfying
energy-neutral operation (ENO) and maximizing system
quality-of-service (QoS) in EH nodes, when using a
given energy source, is proposed. Experiments
validating this framework are performed on a real WSN
platform with both photovoltaic cells and thermal
generators in an indoor environment. Moreover,
simulations on OMNET++ show that the energy storage
optimized from our design framework is utilized up to
93.86\%.",
acknowledgement = ack-nhfb,
articleno = "19",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Shi:2015:ISI,
author = "Yiyu Shi and Takashi Sato",
title = "Introduction to: Special Issue on Cross-Layer System
Design",
journal = j-JETC,
volume = "12",
number = "3",
pages = "20:1--20:??",
month = sep,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2767131",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Tue Sep 22 17:30:11 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
acknowledgement = ack-nhfb,
articleno = "20",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{De:2015:ASC,
author = "Vivek K. De and Andrew B. Kahng and Tanay Karnik and
Bao Liu and Milad Maleki and Lu Wang",
title = "Application-Specific Cross-Layer Optimization Based on
Predictive Variable-Latency {VLSI} Design",
journal = j-JETC,
volume = "12",
number = "3",
pages = "21:1--21:??",
month = sep,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2746341",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Tue Sep 22 17:30:11 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Traditional synchronous VLSI design requires that all
computations in a logic stage complete in one clock
cycle. This leads to increasingly pessimistic design as
technology scaling introduces increasingly significant
parametric variations that result in an increasing
performance variability. Alternatively, by allowing
computations in a logic stage to complete in a variable
number of clock cycles, variable-latency design
provides relaxed timing constraints for average
performance, area, and power consumption optimization.
In this article, we present improved variable-latency
design techniques including: (1) a generic
minimum-intrusion variable-latency VLSI design
paradigm, (2) a signal probability-based approximate
prediction logic construction method for minimum
misprediction rate at minimum cost, and (3) an
application-specific cross-layer analysis methodology.
Our experiments show that the proposed variable-latency
design methodology on average reduces the computation
latency by 26.80\% (14.65\%) at cost of 0.08\% (3.4\%)
area and 0.4\% (2.2\%) energy consumption increase for
the integer (floating point) unit of an open-source
SPARC V8 processor LEON2 synthesized with a clock-cycle
time between 1.97ns (3.49ns) and 5.96ns (13.74ns) based
on the 45nm Nangate open cell library, while an
automotive application-specific design further achieves
an average latency reduction of 41.8\%.",
acknowledgement = ack-nhfb,
articleno = "21",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Patnaik:2015:PPC,
author = "Milan Patnaik and Chidhambaranathan R. and Chirag Garg
and Arnab Roy and V. R. Devanathan and Shankar
Balachandran and V. Kamakoti",
title = "{ProWATCh}: a Proactive Cross-Layer Workload-Aware
Temperature Management Framework for Low-Power Chip
Multi-Processors",
journal = j-JETC,
volume = "12",
number = "3",
pages = "22:1--22:??",
month = sep,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2753762",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Tue Sep 22 17:30:11 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "With the increase in process variations and diversity
in workloads, it is imperative to holistically explore
optimization techniques for power and temperature from
the circuit layer right up to the compiler/operating
system (OS) layer. This article proposes one such
holistic technique, called proactive workload aware
temperature management framework for low-power chip
multi-processors (ProWATCh). At the compiler level
ProWATCh includes two techniques: (1) a novel compiler
design for estimating the architectural parameters of a
task at compile time; and (2) a model-based technique
for dynamic estimation of architectural parameters at
runtime. At the OS level ProWATCh integrates two
techniques: (1) a workload- and temperature-aware
process manager for dynamic distribution of tasks to
different cores; and (2) a model predictive
control-based task scheduler for generating the
efficient sequence of task execution. At the circuit
level ProWATCh implements either of two techniques: (1)
a workload-aware voltage manager for dynamic supply and
body bias voltage assignment for a given frequency in
processors that support adaptive body bias (ABB); or
(2) a workload-aware frequency governor for efficient
assignment of upper and lower frequency bounds for
frequency scaling in processors that do not support an
ABB. Employing ProWATCh (with voltage manager) on an
ABB-compatible 3D OpenSPARC architecture using MiBench
benchmarks resulted in an average 18\% (19C) reduction
in peak temperature. Evaluating ProWATCh on an existing
quad-core Intel Corei7 processor with frequency
governor alone (as the processor does not support an
ABB interface) resulted in 10\% (8C) reduction in peak
temperature when compared to what was obtained using
the native Linux 3.0 completely fair scheduler (CFS).
To study the effectiveness of the proposed framework
across benchmark suites, ProWATCh was evaluated on a
quad-core Intel Corei7 processor using CPU SPEC 2006
benchmarks which resulted in 7C reduction in peak
temperature as compared to the native Linux 3.0 CFS.",
acknowledgement = ack-nhfb,
articleno = "22",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Zhao:2015:STD,
author = "Chenyuan Zhao and Bryant T. Wysocki and Yifang Liu and
Clare D. Thiem and Nathan R. McDonald and Yang Yi",
title = "Spike-Time-Dependent Encoding for Neuromorphic
Processors",
journal = j-JETC,
volume = "12",
number = "3",
pages = "23:1--23:??",
month = sep,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2738040",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Tue Sep 22 17:30:11 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "This article presents our research towards developing
novel and fundamental methodologies for data
representation using spike-timing-dependent encoding.
Time encoding efficiently maps a signal's amplitude
information into a spike time sequence that represents
the input data and offers perfect recovery for
band-limited stimuli. In this article, we pattern the
neural activities across multiple timescales and encode
the sensory information using time-dependent temporal
scales. The spike encoding methodologies for autonomous
classification of time-series signatures are explored
using near-chaotic reservoir computing. The proposed
spiking neuron is compact, low power, and robust. A
hardware implementation of these results is expected to
produce an agile hardware implementation of time
encoding as a signal conditioner for dynamical neural
processor designs.",
acknowledgement = ack-nhfb,
articleno = "23",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Barke:2015:CLA,
author = "Martin Barke and Ulf Schlichtmann",
title = "A Cross-Layer Approach to Measure the Robustness of
Integrated Circuits",
journal = j-JETC,
volume = "12",
number = "3",
pages = "24:1--24:??",
month = sep,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2743022",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Tue Sep 22 17:30:11 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "The demands on system robustness and its immunity
against perturbations are getting increasingly
important. Nearly everybody has an intuitive
understanding of what robustness means, but there is no
proper way how to measure robustness of integrated
circuits already during the design phase. Therefore, a
general cross-layer robustness model and methods to
quantitatively measure robustness are presented.
Moreover, these methods are refined to predict the
robustness against degradation of digital circuits due
to aging effects.",
acknowledgement = ack-nhfb,
articleno = "24",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Zhuo:2015:CLA,
author = "Cheng Zhuo and Houle Gan and Wei-Kai Shih and Alaeddin
A. Aydiner",
title = "A Cross-Layer Approach for Early-Stage Power Grid
Design and Optimization",
journal = j-JETC,
volume = "12",
number = "3",
pages = "25:1--25:??",
month = sep,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2700246",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Tue Sep 22 17:30:11 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Power integrity has become increasingly important for
sub-32nm designs. Many prior works have discussed power
grid design and optimization in the post-layout stage,
when design change is inevitably expensive and
difficult. In contrast, during the early stage of a
development cycle, designers have more flexibility to
improve the design quality. However, there are several
fundamental challenges at early stage when the design
database is not complete, including extraction,
modeling, and optimization. This article tackles these
fundamental issues of early-stage power grid design
from architecture to layout. The proposed methods have
been silicon validated on 32nm on-market chips and
successfully applied to a 22nm design for its
early-stage power grid design. The findings from such
practices reveal that, for sub-32nm chips, an intrinsic
on-die capacitance and power gate scheme may have more
significant impact than expected on power integrity,
and needs to be well addressed at early stage.",
acknowledgement = ack-nhfb,
articleno = "25",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Lee:2015:REE,
author = "Jinho Lee and Kyungsu Kang and Kiyoung Choi",
title = "{REDELF}: an Energy-Efficient Deadlock-Free Routing
for {$3$D} {NoCs} with Partial Vertical Connections",
journal = j-JETC,
volume = "12",
number = "3",
pages = "26:1--26:??",
month = sep,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2751560",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Tue Sep 22 17:30:11 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "3D integrated circuits (3D ICs) using through-silicon
vias (TSVs) allow to envision the stacking of dies with
different functions and technologies, using as an
interconnect backbone a 3D network-on-chip (NoC).
However, partial vertical connection in 3D NoCs seems
unavoidable because of the large overhead of TSV itself
(e.g., large footprint, low fabrication yield,
additional fabrication processes) as well as the
heterogeneity in dimension. This article proposes an
energy-efficient deadlock-free routing algorithm for 3D
mesh topologies where vertical connections partially
exist. By introducing some rules for selecting
elevators (i.e., vertical links between dies), the
routing algorithm can eliminate the dedicated virtual
channel requirement. In this article, the rules
themselves as well as the proof of deadlock freedom are
given. By eliminating the virtual channels for deadlock
avoidance, the proposed routing algorithm reduces the
energy consumption by 38.9\% compared to a conventional
routing algorithm. When the virtual channel is used for
reducing the head-of-line blocking, the proposed
routing algorithm increases performance by up to 23.1\%
and 6.9\% on average.",
acknowledgement = ack-nhfb,
articleno = "26",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Zoni:2015:MDP,
author = "Davide Zoni and William Fornaciari",
title = "Modeling {DVFS} and Power-Gating Actuators for
Cycle-Accurate {NoC}-Based Simulators",
journal = j-JETC,
volume = "12",
number = "3",
pages = "27:1--27:??",
month = sep,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2751561",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Tue Sep 22 17:30:11 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Networks-on-chip (NoCs) are a widely recognized viable
interconnection paradigm to support the multi-core
revolution. One of the major design issues of multicore
architectures is still the power, which can no longer
be considered mainly due to the cores, since the NoC
contribution to the overall energy budget is relevant.
To face both static and dynamic power while balancing
NoC performance, different actuators have been
exploited in literature, mainly dynamic voltage
frequency scaling (DVFS) and power gating. Typically,
simulation-based tools are employed to explore the huge
design space by adopting simplified models of the
components. As a consequence, the majority of
state-of-the-art on NoC power-performance optimization
do not accurately consider timing and power overheads
of actuators, or (even worse) do not consider them at
all, with the risk of overestimating the benefits of
the proposed methodologies. This article presents a
simulation framework for power-performance analysis of
multicore architectures with specific focus on the NoC.
It integrates accurate power gating and DVFS models
encompassing also their timing and power overheads. The
value added of our proposal is manyfold: (i) DVFS and
power gating actuators are modeled starting from
SPICE-level simulations; (ii) such models have been
integrated in the simulation environment; (iii) policy
analysis support is plugged into the framework to
enable assessment of different policies; (iv) a
flexible GALS (globally asynchronous locally
synchronous) support is provided, covering both
handshake and FIFO re-synchronization schemas. To
demonstrate both the flexibility and extensibility of
our proposal, two simple policies exploiting the
modeled actuators are discussed in the article.",
acknowledgement = ack-nhfb,
articleno = "27",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Chen:2015:GPF,
author = "Xianmin Chen and Niraj K. Jha",
title = "{gem5-PVT}: a Framework for {FinFET} System Simulation
under {PVT} Variations",
journal = j-JETC,
volume = "12",
number = "3",
pages = "28:1--28:??",
month = sep,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2755564",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Tue Sep 22 17:30:11 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "FinFET has begun replacing CMOS at the 22nm technology
node and beyond. Compared to planar CMOS, FinFET has a
higher on-current and lower leakage due to its
double-gate structure. A FinFET-based system simulation
framework can be very helpful to system architects for
early-stage design-space exploration using this new
technology. However, such a simulator does not exist.
We fill this gap by presenting the details of one such
simulation framework, called gem5-PVT, that we have
developed. Our simulation framework combines and
extends existing lower-level FinFET simulators to
support timing, power, and thermal studies of
FinFET-based chip multiprocessor systems under process,
voltage, and temperature (PVT) variations. It uses a
bottom-up modeling approach based on logic/memory cell
libraries that have been very accurately characterized
using TCAD device simulation. This allows accuracy to
bubble up to the system level. The framework is modular
and automated, hence enables system designers the
flexibility to evaluate various system implementations.
It is currently targeted at the 22nm FinFET technology.
We report results for two case studies to demonstrate
its usefulness. One study shows that more than 62.1$
\times $ system-level leakage reduction, at the same
performance, is possible when using a particular FinFET
logic style. Another study characterizes core-to-core
frequency and power variations that result from
underlying PVT variations and compares the
effectiveness of variation-aware scheduling schemes.",
acknowledgement = ack-nhfb,
articleno = "28",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Bahreini:2015:MMS,
author = "Tayebeh Bahreini and Naser Mohammadzadeh",
title = "An {MINLP} Model for Scheduling and Placement of
Quantum Circuits with a Heuristic Solution Approach",
journal = j-JETC,
volume = "12",
number = "3",
pages = "29:1--29:??",
month = sep,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2766452",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Tue Sep 22 17:30:11 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Recent works on quantum physical design have pushed
the scheduling and placement of quantum circuit into
their prominent positions. In this article, a mixed
integer nonlinear programming model is proposed for the
placement and scheduling of quantum circuits in such a
way that latency is minimized. The proposed model
determines locations of gates and the sequence of
operations. The proposed model is proved reducible to a
quadratic assignment problem which is a well-known
NP-complete combinatorial optimization problem. Since
it is impossible to find the optimal solution of this
NP-complete problem for large quantum circuits within a
reasonable amount of time, a metaheuristic solution
method is developed for the proposed model. Some
experiments are conducted to evaluate the performance
of the developed solution approach. Experimental
results show that the proposed approach improves
average latency by about 24.09\% for the attempted
benchmarks.",
acknowledgement = ack-nhfb,
articleno = "29",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Rahman:2015:NVR,
author = "Mostafizur Rahman and Santosh Khasanvis and Csaba
Andras Moritz",
title = "Nanowire Volatile {RAM} as an Alternative to {SRAM}",
journal = j-JETC,
volume = "12",
number = "3",
pages = "30:1--30:??",
month = sep,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2714567",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Tue Sep 22 17:30:11 MDT 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Maintaining benefits of CMOS technology scaling is
becoming challenging, primarily due to increased
manufacturing complexities and unwanted passive power
dissipations. This is particularly challenging in SRAM,
where manufacturing precision and leakage power control
are critical issues. To alleviate these challenges, we
proposed a novel volatile memory alternative to SRAM
called nanowire volatile RAM (NWRAM). Due to NWRAM's
regular grid-based layout and innovative circuit style,
manufacturing complexities are reduced and, at the same
time, considerable benefits are attained in terms of
performance and leakage power reduction. In this
article we elaborate NWRAM's circuit aspects and
manufacturability, and quantify benefits at 16nm
technology node through simulation against
state-of-the-art 6T-SRAM and gridded 8T-SRAM designs.
Our results show that when lower bounds in design rules
are considered, 10T-NWRAM's read and write time are
1.38x and 2x faster, and the leakage power is 14x
better in comparison to high-performance 6T-SRAM.
Similarly the 10T-NWRAM achieves 1.3x and 1.9x read and
write performance, and 35x leakage power improvements
compared to high-performance 8T-SRAM. 10T-NWRAM's
density is comparable to 6T-SRAM and 8T-SRAM for lower
bounds, but exhibits higher active power in similar
comparisons. This article details all benchmarking
results and provides thorough analysis of NWRAM's
evaluations.",
acknowledgement = ack-nhfb,
articleno = "30",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Khouzani:2016:FEP,
author = "Hoda Aghaei Khouzani and Yuan Xue and Chengmo Yang",
title = "Fully Exploiting {PCM} Write Capacity Within Near Zero
Cost Through Segment-Based Page Allocation",
journal = j-JETC,
volume = "12",
number = "4",
pages = "31:1--31:??",
month = jul,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2856423",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Dec 1 09:26:07 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Improving the endurance of phase change memory (PCM)
is a fundamental issue when PCM technology is
considered as an alternative to main memory usage.
Existing wear-leveling techniques overcome this
challenge through constantly remapping hot virtual
pages, thus engendering a fair amount of extra write
operations to PCM and imposing considerable performance
and energy overhead. Our observation is that it is
unnecessary to fully balance the accesses to different
physical page frames during the execution of each
process. Instead, since endurance is a lifetime factor,
the hot virtual pages of different processes can be
mapped to different physical pages in the PCM.
Leveraging this property, we develop a wear-resistant
page allocation algorithm, which exploits the diverse
write characteristics of different program segments to
improve PCM write endurance within almost no extra
remapping cost in terms of energy and performance. The
results of experiments conducted based on SPEC
benchmarks show that the proposed technique can prolong
PCM lifetime by hundreds of times within nearly zero
searching and remapping overhead.",
acknowledgement = ack-nhfb,
articleno = "31",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Layer:2016:RSP,
author = "Christophe Layer and Laurent Becker and Kotb Jabeur
and Sylvain Claireux and Bernard Dieny and Guillaume
Prenat and Gregory {Di Pendina} and Stephane Gros and
Pierre Paoli and Virgile Javerliac and Fabrice
Bernard-Granger and Loic Decloedt",
title = "Reducing System Power Consumption Using Check-Pointing
on Nonvolatile Embedded Magnetic Random Access
Memories",
journal = j-JETC,
volume = "12",
number = "4",
pages = "32:1--32:??",
month = jul,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2876507",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Dec 1 09:26:07 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "The most widely used embedded memory technology,
static random access memory (SRAM), is heading toward
scaling problems in advanced technology nodes due to
the leakage currents caused by the quantum tunneling
effect. As an alternative, spin-transfer torque
magnetic RAM (STT-MRAM) technology shows comparable
performance in terms of speed and power consumption and
much better performance in terms of density and
leakage. Moreover, MRAM brings up new paradigms in
system design thanks to its inherent nonvolatility,
which allows the definition of new instant-on/off
policies and leakage current optimization. Based on our
compact model, we have developed a fully characterized
system-on-chip from the basic cell up to the system
architecture in a 40nm LP hybrid CMOS/magnetic process.
Through simulations, first we demonstrate that STT-MRAM
is a candidate for the memory part of embedded systems,
and second we implement a check-pointing methodology
based on the regular interrupt routines of a processor
to enable a fast power on and off functionality. Using
a synthetic benchmark developed in high-level
programming languages intended to be representative of
integer system performance, our method shows that
having MRAM instead of SRAM in an embedded design
brings up important energy savings. The influence of
the check-pointing routine on power consumption is
finally evaluated with regard to various shutdown and
restart behaviors.",
acknowledgement = ack-nhfb,
articleno = "32",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Wu:2016:RCA,
author = "Chengwen Wu and Guangyan Zhang and Keqin Li",
title = "Rethinking Computer Architectures and Software Systems
for Phase-Change Memory",
journal = j-JETC,
volume = "12",
number = "4",
pages = "33:1--33:40",
month = jul,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2893186",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Dec 1 09:26:07 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "With dramatic growth of data and rapid enhancement of
computing powers, data accesses become the bottleneck
restricting overall performance of a computer system.
Emerging phase-change memory (PCM) is byte-addressable
like DRAM, persistent like hard disks and Flash SSD,
and about four orders of magnitude faster than hard
disks or Flash SSDs for typical file system I/Os. The
maturity of PCM from research to production provides a
new opportunity for improving the I/O performance of a
system. However, PCM also has some weaknesses, for
example, long write latency, limited write endurance,
and high active energy. Existing processor cache
systems, main memory systems, and online storage
systems are unable to leverage the advantages of PCM,
and/or to mitigate PCM's drawbacks. The reason behind
this incompetence is that they are designed and
optimized for SRAM, DRAM memory, and hard drives,
respectively, instead of PCM memory. There have been
some efforts concentrating on rethinking computer
architectures and software systems for PCM. This
article presents a detailed survey and review of the
areas of computer architecture and software systems
that are oriented to PCM devices. First, we identify
key technical challenges that need to be addressed
before this memory technology can be leveraged, in the
form of processor cache, main memory, and online
storage, to build high-performance computer systems.
Second, we examine various designs of computer
architectures and software systems that are PCM aware.
Finally, we obtain several helpful observations and
propose a few suggestions on how to leverage PCM to
optimize the performance of a computer system.",
acknowledgement = ack-nhfb,
articleno = "33",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Deb:2016:RSS,
author = "Arighna Deb and Debesh K. Das and Hafizur Rahaman and
Robert Wille and Rolf Drechsler and Bhargab B.
Bhattacharya",
title = "Reversible Synthesis of Symmetric Functions with a
Simple Regular Structure and Easy Testability",
journal = j-JETC,
volume = "12",
number = "4",
pages = "34:1--34:??",
month = jul,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2894757",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Dec 1 09:26:07 MST 2016",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "In this article, we introduce a novel method of
synthesizing symmetric Boolean functions with
reversible logic gates. In contrast to earlier
approaches, the proposed technique deploys a simple,
regular, and cascaded structure consisting of an array
of Peres and CNOT gates, which results in significant
reduction with respect to the quantum cost. However,
the number of circuit inputs may increase slightly when
such cascades are used. In order to reduce their
number, we next propose a postsynthesis optimization
phase that allows judicious reuse of circuit lines. In
addition to offering a cost-effective synthesis
methodology, the proposed reversible logic structure
supports elegant testability properties. With respect
to all single or partial missing gate faults (SMGFs and
PMGFs), or repeated gate faults (RGFs) in such an
n-input circuit module, we show that it admits a
universal test set of constant cardinality (=3) for any
value of n. Thus, considering both the cost and
testability issues, this approach provides a superior
option for synthesizing symmetric functions compared to
existing designs.",
acknowledgement = ack-nhfb,
articleno = "34",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Wang:2016:NPM,
  author =       "Qian Wang and Yongtae Kim and Peng Li",
  title =        "Neuromorphic Processors with Memristive Synapses:
                 Synaptic Interface and Architectural Exploration",
  journal =      j-JETC,
  volume =       "12",
  number =       "4",
  pages =        "35:1--35:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2894756",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Thu Dec 1 09:26:07 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Due to their nonvolatile nature, excellent
                 scalability, and high density, memristive nanodevices
                 provide a promising solution for low-cost on-chip
                 storage. Integrating memristor-based synaptic crossbars
                 into digital neuromorphic processors (DNPs) may
                 facilitate efficient realization of brain-inspired
                 computing. This article investigates architectural
                 design exploration of DNPs with memristive synapses by
                 proposing two synapse readout schemes. The key design
                 tradeoffs involving different analog-to-digital
                 conversions and memory accessing styles are thoroughly
                 investigated. A novel storage strategy optimized for
                 feedforward neural networks is proposed in this work,
                 which greatly reduces the energy and area cost of the
                 memristor array and its peripherals.",
  acknowledgement = ack-nhfb,
  articleno =    "35",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Biswas:2016:IFW,
  author =       "Kalyan Biswas and Angsuman Sarkar and Chandan Kumar
                 Sarkar",
  title =        "Impact of Fin Width Scaling on {RF}\slash Analog
                 Performance of Junctionless Accumulation-Mode Bulk
                 {FinFET}",
  journal =      j-JETC,
  volume =       "12",
  number =       "4",
  pages =        "36:1--36:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2903143",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Thu Dec 1 09:26:07 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "In this article, the RF and analog performance of
                 junctionless accumulation-mode bulk FinFETs is analyzed
                 by employing the variation of fin width so that it can
                 be used as a high-efficiency RF integrated circuit
                 design. The RF/analog performance evaluation has been
                 carried out using the ATLAS 3D device simulator in
                 terms of evaluation of figure-of-merits metrics such as
                 transconductance ($ g_m $), gate-to-source/drain
                 capacitances ($ C_{gg} $), cutoff frequency ($ f_T $),
                 and maximum frequency of oscillation ($ f_{max} $).
                 Apart from RF/analog performance investigation, the
                 variation of ON-current to OFF-current ratio ($ I_{ON}
                 / I_{OFF} $) and transconductance generation factor
                 ($ g_m / I_{ds} $) have also been carried out. From
                 this study, it is observed that smaller fin width of
                 the device improves its performance.",
  acknowledgement = ack-nhfb,
  articleno =    "36",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Chen:2016:AMS,
  author =       "Yi-Hang Chen and Jian-Yu Chen and Juinn-Dar Huang",
  title =        "Area Minimization Synthesis for Reconfigurable
                 Single-Electron Transistor Arrays with Fabrication
                 Constraints",
  journal =      j-JETC,
  volume =       "12",
  number =       "4",
  pages =        "37:1--37:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2906360",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Thu Dec 1 09:26:07 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Power dissipation has become a pressing issue of
                 concern in the designs of most electronic system as
                 fabrication processes enter even deeper submicron
                 regions. More specifically, leakage power plays a
                 dominant role in system power dissipation. An emerging
                 circuit design style, the reconfigurable
                 single-electron transistor (SET) array, has been
                 proposed for continuing Moore's Law due to its
                 ultra-low leakage power consumption. Recently, several
                 works have been proposed to address the issues related
                 to automated synthesis for the reconfigurable SET
                 array. Nevertheless, all of those existing approaches
                 consider mandatory fabrication constraints of SET array
                 merely in late synthesis stages. In this article, we
                 propose a synthesis algorithm, featuring input-variable
                 ordering and dynamic product term ordering, for area
                 minimization. The fabrication constraints are taken
                 into account at every synthesis stage of proposed flow
                 to guarantee better synthesis outcomes. We also develop
                 a simulated annealing-based postprocess to find a
                 proper phase assignment of each input variable for
                 further area reduction. Experimental results show that
                 our new methodology can achieve up to 29\% area
                 reduction as compared to existing state-of-the-art
                 techniques.",
  acknowledgement = ack-nhfb,
  articleno =    "37",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Kim:2016:CAP,
  author =       "Moon Seok Kim and William Cane-Wissing and Xueqing Li
                 and Jack Sampson and Suman Datta and Sumeet Kumar Gupta
                 and Vijaykrishnan Narayanan",
  title =        "Comparative Area and Parasitics Analysis in {FinFET}
                 and Heterojunction Vertical {TFET} Standard Cells",
  journal =      j-JETC,
  volume =       "12",
  number =       "4",
  pages =        "38:1--38:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2914790",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Thu Dec 1 09:26:07 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Vertical tunnel field-effect transistors (VTFETs) have
                 been extensively explored to overcome the scaling
                 limits and to improve on-current (I$_{ON}$) compared to
                 standard lateral device structures for the future
                 technologies. The benefits in terms of reduced
                 footprint, high I$_{ON}$ and feasibility of fabrication
                 have been demonstrated in several works. Among various
                 VTFETs, the asymmetric heterojunction vertical tunnel
                 FETs (HVTFETs) have emerged as one of the promising
                 alternatives to standard transistors for low-voltage
                 applications. However, while such device-level benefits
                 without parasitics have been widely investigated,
                 logic-gate design with parasitics and layout
                 implications are not clear. In this article, we
                 investigate and compare the layouts and parasitic
                 capacitances and resistances of HVTFETs with FinFETs.
                 Due to the vertical device structure of HVTFETs, a
                 smaller footprint is observed compared to FinFETs in
                 cells with small fan-in. However, for high fan-in
                 cells, HVTFETs exhibit area overheads due to
                 infeasibility of contact sharing in parallel and series
                 transistors. These area overheads also lead to
                 approximately 48\% higher parasitic capacitance and
                 resistance compared to FinFETs when the number of
                 parallel and series connections increases. Further, in
                 order to analyze the impact of parasitics, we modeled
                 the analytical parasitics in SPICE. The models for both
                 HVTFETs and FinFETs with parasitics were used to
                 simulate a 15-stage inverter-based ring oscillator (RO)
                 in order to compare the delay and energy. Our
                 simulation results clearly show that HVTFETs exhibit
                 less delay at a $ V_{DD} < 0.45 $ V and higher energy
                 efficiency for $ V_{DD} $s in the range of 0.3V--0.7V,
                 albeit at the cost of 8\% performance degradation.",
  acknowledgement = ack-nhfb,
  articleno =    "38",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Ahsan:2016:DMQ,
  author =       "Muhammad Ahsan and Rodney {Van Meter} and Jungsang
                 Kim",
  title =        "Designing a Million-Qubit Quantum Computer Using a
                 Resource Performance Simulator",
  journal =      j-JETC,
  volume =       "12",
  number =       "4",
  pages =        "39:1--39:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2830570",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Thu Dec 1 09:26:07 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "The optimal design of a fault-tolerant quantum
                 computer involves finding an appropriate balance
                 between the burden of large-scale integration of noisy
                 components and the load of improving the reliability of
                 hardware technology. This balance can be evaluated by
                 quantitatively modeling the execution of quantum logic
                 operations on a realistic quantum hardware containing
                 limited computational resources. In this work, we
                 report a complete performance simulation software tool
                 capable of (1) searching the hardware design space by
                 varying resource architecture and technology
                 parameters, (2) synthesizing and scheduling a
                 fault-tolerant quantum algorithm within the hardware
                 constraints, (3) quantifying the performance metrics
                 such as the execution time and the failure probability
                 of the algorithm, and (4) analyzing the breakdown of
                 these metrics to highlight the performance bottlenecks
                 and visualizing resource utilization to evaluate the
                 adequacy of the chosen design. Using this tool, we
                 investigate a vast design space for implementing key
                 building blocks of Shor's algorithm to factor a
                 1,024-bit number with a baseline budget of 1.5 million
                 qubits. We show that a trapped-ion quantum computer
                 designed with twice as many qubits and one-tenth of the
                 baseline infidelity of the communication channel can
                 factor a 2,048-bit integer in less than 5 months.",
  acknowledgement = ack-nhfb,
  articleno =    "39",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Arabzadeh:2016:QLS,
  author =       "Mona Arabzadeh and Mahboobeh Houshmand and Mehdi
                 Sedighi and Morteza Saheb Zamani",
  title =        "Quantum-Logic Synthesis of {Hermitian} Gates",
  journal =      j-JETC,
  volume =       "12",
  number =       "4",
  pages =        "40:1--40:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2794263",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Thu Dec 1 09:26:07 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "In this article, the problem of synthesizing a general
                 Hermitian quantum gate into a set of primary quantum
                 gates is addressed. To this end, an extended version of
                 the Jacobi approach for calculating the eigenvalues of
                 Hermitian matrices in linear algebra is considered as
                 the basis of the proposed synthesis method. The quantum
                 circuit synthesis method derived from the Jacobi
                 approach and its optimization challenges are described.
                 It is shown that the proposed method results in
                 multiple-control rotation gates around the y axis,
                 multiple-control phase shift gates, multiple-control
                 NOT gates, and a middle diagonal Hermitian matrix,
                 which can be synthesized to multiple-control Pauli Z
                 gates. Using the proposed approach, it is shown how
                 multiple-control U gates, where U is a single-qubit
                 Hermitian quantum gate, can be implemented using a
                 linear number of elementary gates in terms of circuit
                 lines with the aid of one auxiliary qubit in an
                 arbitrary state.",
  acknowledgement = ack-nhfb,
  articleno =    "40",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Soeken:2016:ELB,
  author =       "Mathias Soeken and Robert Wille and Oliver Keszocze
                 and D. Michael Miller and Rolf Drechsler",
  title =        "Embedding of Large {Boolean} Functions for Reversible
                 Logic",
  journal =      j-JETC,
  volume =       "12",
  number =       "4",
  pages =        "41:1--41:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2786982",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Thu Dec 1 09:26:07 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Reversible logic represents the basis for many
                 emerging technologies and has recently been intensively
                 studied. However, most of the Boolean functions of
                 practical interest are irreversible and must be
                 embedded into a reversible function before they can be
                 synthesized. Thus far, an optimal embedding is
                 guaranteed only for small functions, whereas a
                 significant overhead results when large functions are
                 considered. We study this issue in this article. We
                 prove that determining an optimal embedding is
                 coNP-hard already for restricted cases. Then, we
                 propose heuristic and exact methods for determining
                 both the number of additional lines and a corresponding
                 embedding. For the approaches, we considered sum of
                 products and binary decision diagrams as function
                 representations. Experimental evaluations show the
                 applicability of the approaches for large functions.
                 Consequently, the reversible embedding of large
                 functions is enabled as a precursor to subsequent
                 synthesis.",
  acknowledgement = ack-nhfb,
  articleno =    "41",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Tang:2016:DPM,
  author =       "Aoxiang Tang and Xun Gao and Lung-Yen Chen and Niraj
                 K. Jha",
  title =        "Delay\slash Power Modeling and Optimization of
                 {FinFET} Circuit Modules under {PVT} Variations:
                 Observing the Trends between the 22nm and 14nm
                 Technology Nodes",
  journal =      j-JETC,
  volume =       "12",
  number =       "4",
  pages =        "42:1--42:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2795231",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Thu Dec 1 09:26:07 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "The semiconductor industry has moved to FinFETs
                 because of their superior ability to mitigate
                 short-channel effects relative to CMOS. Thus, good
                 FinFET delay and power models are urgently needed to
                 facilitate FinFET IC design at the upcoming technology
                 nodes. Another urgent problem that needs to be
                 addressed with continued technology scaling is how to
                 analyze circuit performance and power consumption under
                 process, voltage, and temperature (PVT) variations.
                 Such variations arise due to limitations of lithography
                 that lead to variations in the physical dimensions of
                 the device or due to environmental variations. In this
                 article, we propose a delay/power modeling framework
                 for analysis of FinFET logic circuits under PVT
                 variations. We present models for FinFET logic gates
                 and three FinFET SRAM cells. We use GenFin, which is a
                 genetic algorithm based statistical circuit-level
                 delay/power optimizer, to produce the models for
                 functional units (FUs) employed in a processor. We
                 compare the impact of PVT variations at the 22nm and
                 14nm FinFET technology nodes. We evaluate cache
                 performance for various cache capacities and
                 temperatures as well as that of FUs. Our device
                 simulation results show that the $ 3 \sigma / \mu $
                 spread for 14nm circuits is, on average, 38.5\% higher
                 in dynamic power and 21.4\% higher in logarithm of
                 leakage power relative to 22nm FinFET circuits.
                 However, the delay spread depends on the circuit.",
  acknowledgement = ack-nhfb,
  articleno =    "42",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Chaudhuri:2016:ULL,
  author =       "Sourindra M. Chaudhuri and Niraj K. Jha",
  title =        "Ultra-Low-Leakage and High-Performance Logic Circuit
                 Design Using Multiparameter Asymmetric {FinFETs}",
  journal =      j-JETC,
  volume =       "12",
  number =       "4",
  pages =        "43:1--43:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2832913",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Thu Dec 1 09:26:07 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Recently, multigate field-effect transistors have
                 started replacing traditional planar MOSFETs to keep
                 pace with Moore's Law in deep submicron technology.
                 Among different multigate transistors, FinFETs have
                 become the preferred choice of the semiconductor
                 industry owing to low fabrication cost, superior
                 performance, lower leakage, and design flexibility. The
                 back and front gates of a FinFET can either be shorted
                 or remain independent, leading to two modes of
                 operation: Shorted-Gate (SG) and Independent-Gate (IG).
                 For a given mode of operation, the physical parameters
                 of the FinFET can either be symmetric or asymmetric in
                 nature. In this article, for the first time, we analyze
                 multiparameter asymmetric SG FinFETs and illustrate
                 their potential for implementing logic gates and
                 circuits that are both ultra-low-leakage and
                 high-performance simultaneously. We restrict this work
                 to SG devices because IG FinFETs (symmetric/asymmetric)
                 suffer from severely degraded on-current, which makes
                 them unattractive for high-performance designs. We
                 first compare head-to-head all viable single- and
                 multiparameter symmetric/asymmetric SG FinFETs. Among
                 all such FinFETs, the traditional SG (which are
                 symmetric in nature), Asymmetric Workfunction
                 Shorted-Gate (AWSG), and Asymmetric
                 Workfunction-Underlap Shorted-Gate (AWUSG) FinFETs show
                 the most promise. We characterize these devices under
                 process variations in gate length $ (L_G) $, fin
                 thickness $ (T_{SI}) $, gate-oxide thickness $ (T_{OX})
                 $, gate underlap $ (L_{UN}) $, and gate-workfunction $
                 (\Phi_G) $ as well as supply voltage $ (V_{DD}) $
                 variations, followed by a gate-level leakage/delay
                 analysis at different temperatures. Although AWSG
                 FinFETs consume very low leakage power, they do suffer
                 from performance degradation relative to SG FinFETs.
                 Similarly, our study reveals that no other
                 single-parameter asymmetric FinFET provides a good
                 combination of low-power and high-performance design.
                 We show that gates/circuits based on AWUSG FinFETs are
                 faster, yet consume much less leakage power and less
                 area than gates/circuits based on traditional SG
                 FinFETs. We observe 53.4\% (30.2\%) maximum (average)
                 reduction in total power at temperature $ T = 348 $K
                 while meeting the same delay constraint, with 14.2\%
                 (13.5\%) reduction in area for AWUSG circuits relative
                 to SG circuits. At $ T = 373 $K, we see 68.6\% (46.9\%)
                 maximum (average) reduction in total power.",
  acknowledgement = ack-nhfb,
  articleno =    "43",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Beuningen:2016:PPR,
  author =       "Anja {Von Beuningen} and Luca Ramini and Davide
                 Bertozzi and Ulf Schlichtmann",
  title =        "{PROTON+}: a Placement and Routing Tool for {$3$D}
                 Optical Networks-on-Chip with a Single Optical Layer",
  journal =      j-JETC,
  volume =       "12",
  number =       "4",
  pages =        "44:1--44:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2830716",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Thu Dec 1 09:26:07 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Optical Networks-on-Chip (ONoCs) are a promising
                 technology to overcome the bottleneck of low bandwidth
                 of electronic Networks-on-Chip. Recent research
                 discusses power and performance benefits of ONoCs based
                 on their system-level design, while layout effects are
                 typically overlooked. As a consequence, laser power
                 requirements are inaccurately computed from the logic
                 scheme but do not consider the layout. In this article,
                 we propose PROTON+, a fast tool for placement and
                 routing of 3D ONoCs minimizing the total laser power.
                 Using our tool, the required laser power of the system
                 can be decreased by up to 94\% compared to a
                 state-of-the-art manually designed layout. In addition,
                 with the help of our tool, we study the physical design
                 space of ONoC topologies. For this purpose, topology
                 synthesis methods (e.g., global connectivity and
                 network partitioning) as well as different objective
                 function weights are analyzed in order to minimize the
                 maximum insertion loss and ultimately the system's
                 laser power consumption. For the first time, we study
                 optimal positions of memory controllers. A comparison
                 of our algorithm to a state-of-the-art placer for
                 electronic circuits shows the need for a different set
                 of tools custom-tailored for the particular
                 requirements of optical interconnects.",
  acknowledgement = ack-nhfb,
  articleno =    "44",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Dehghani:2016:NAO,
  author =       "Abbas Dehghani and Kamal Jamshidi",
  title =        "A Novel Approach to Optimize Fault-Tolerant Hybrid
                 Wireless Network-on-Chip Architectures",
  journal =      j-JETC,
  volume =       "12",
  number =       "4",
  pages =        "45:1--45:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2814572",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Thu Dec 1 09:26:07 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Wireless Network-on-Chip (WNoC) architectures have
                 emerged as a promising interconnection infrastructure
                 to address the performance limitations of traditional
                 wire-based multihop NOCs. Nevertheless, the WNoC
                 systems encounter high failure rates due to problems
                 pertaining to integration and manufacturing of wireless
                 interconnection in nano-domain technology. As a result,
                 the permanent failures may lead to the formation of any
                 shape of faulty regions in the interconnection network,
                 which can break down the whole system. This issue is
                 not investigated in previous studies on WNoC
                 architectures. Our solution advocates the adoption of
                 communication structures with both node and link on
                 disjoint paths. On the other hand, the imposed costs of
                 WNoC design must be reasonable. Hence, a novel approach
                 to design an optimized fault-tolerant hybrid
                 hierarchical WNoC architecture for enhancing
                 performance as well as minimizing system costs is
                 proposed. The experimental results indicate that the
                 robustness of this newly proposed design is
                 significantly enhanced in comparison with its the
                 fault-tolerant wire-based counterparts in the presence
                 of various faulty regions under both synthetic and
                 application-specific traffic patterns.",
  acknowledgement = ack-nhfb,
  articleno =    "45",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Mittal:2016:SAT,
  author =       "Sparsh Mittal",
  title =        "A Survey of Architectural Techniques for
                 Near-Threshold Computing",
  journal =      j-JETC,
  volume =       "12",
  number =       "4",
  pages =        "46:1--46:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2821510",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Thu Dec 1 09:26:07 MST 2016",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Energy efficiency has now become the primary obstacle
                 in scaling the performance of all classes of computing
                 systems. Low-voltage computing, specifically,
                 near-threshold voltage computing (NTC), which involves
                 operating the transistor very close to and yet above
                 its threshold voltage, holds the promise of providing
                 many-fold improvement in energy efficiency. However,
                 use of NTC also presents several challenges such as
                 increased parametric variation, failure rate, and
                 performance loss. This article surveys several recent
                 techniques that aim to offset these challenges for
                 fully leveraging the potential of NTC. By classifying
                 these techniques along several dimensions, we also
                 highlight their similarities and differences. It is
                 hoped that this article will provide insights into
                 state-of-the-art NTC techniques to researchers and
                 system designers and inspire further research in this
                 field.",
  acknowledgement = ack-nhfb,
  articleno =    "46",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Sinanoglu:2016:GES,
  author =       "Ozgur Sinanoglu and Ramesh Karri",
  title =        "Guest Editorial Special Issue on Secure and
                 Trustworthy Computing",
  journal =      j-JETC,
  volume =       "13",
  number =       "1",
  pages =        "1:1--1:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2898433",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Das:2016:MPU,
  author =       "Jayita Das and Kevin Scott and Sanjukta Bhanja",
  title =        "{MRAM PUF}: Using Geometric and Resistive Variations
                 in {MRAM} Cells",
  journal =      j-JETC,
  volume =       "13",
  number =       "1",
  pages =        "2:1--2:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2854154",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "In this work, we have studied two novel techniques to
                 enhance the performance of existing geometry-based
                 magnetoresistive RAM physically unclonable function
                 (MRAM PUF). Geometry-based MRAM PUFs rely only on
                 geometric variations in MRAM cells that generate
                 preferred ground state in cells and form the basis of
                 digital signature generation. Here we study two novel
                 ways to improve the performance of the geometry-based
                 PUF signature. First, we study how the choice between
                 specific geometries can enhance the reliability of the
                 digital signature. Using fabrications and simulations,
                 we study how the rectangular shape in the PUF cells is
                 more susceptible to lithography-based geometric
                 variations than the elliptical shape of the same aspect
                 ratio. The choice of rectangular over elliptical masks
                 in the lithography process can therefore improve the
                 reliability of the digital signature from PUF. Second,
                 we present a MRAM PUF architecture and study how
                 resistances in MRAM cells can be used to generate
                 analog voltage output that are easier to detect if
                 probed by an adversary. In the new PUF architecture, we
                 have the choice between selection of rows and columns
                 to generate unique and hard-to-predict analog voltage
                 outputs. For a 64-bit response, the analog voltage
                 output can range between 20 and 500 mV, making it tough
                 for an adversary to guess over this wide range of
                 voltages. This work ends with a discussion on the
                 threat resilience ability of the new improved MRAM PUF
                 to attacks from probing-, tampering-, reuse-, and
                 simulation-based models.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Bi:2016:ETB,
  author =       "Yu Bi and Kaveh Shamsi and Jiann-Shiun Yuan and
                 Pierre-Emmanuel Gaillardon and Giovanni {De Micheli}
                 and Xunzhao Yin and X. Sharon Hu and Michael Niemier
                 and Yier Jin",
  title =        "Emerging Technology-Based Design of Primitives for
                 Hardware Security",
  journal =      j-JETC,
  volume =       "13",
  number =       "1",
  pages =        "3:1--3:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2816818",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Hardware security concerns such as intellectual
                 property (IP) piracy and hardware Trojans have
                 triggered research into circuit protection and
                 malicious logic detection from various design
                 perspectives. In this article, emerging technologies
                 are investigated by leveraging their unique properties
                 for applications in the hardware security domain.
                 Security, for the first time, will be treated as one
                 design metric for emerging nano-architecture. Five
                 example circuit structures including camouflaging
                 gates, polymorphic gates, current/voltage-based circuit
                 protectors, and current-based XOR logic are designed to
                 show the high efficiency of silicon nanowire FETs and
                 graphene SymFET in applications such as circuit
                 protection and IP piracy prevention. Simulation results
                 indicate that highly efficient and secure circuit
                 structures can be achieved via the use of non-CMOS
                 devices.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Iyengar:2016:SPS,
  author =       "Anirudh Iyengar and Swaroop Ghosh and Kenneth Ramclam
                 and Jae-Won Jang and Cheng-Wei Lin",
  title =        "Spintronic {PUFs} for Security, Trust, and
                 Authentication",
  journal =      j-JETC,
  volume =       "13",
  number =       "1",
  pages =        "4:1--4:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2809781",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "We propose spintronic physically unclonable functions
                 (PUFs) to exploit security-specific properties of
                 domain wall memory (DWM) for security, trust, and
                 authentication. We note that the nonlinear dynamics of
                 domain walls (DWs) in the physical magnetic system is
                 an untapped source of entropy that can be leveraged for
                 hardware security. The spatial and temporal randomness
                 in the physical system is employed in conjunction with
                 microscopic and macroscopic properties such as
                 stochastic DW motion, stochastic pinning/depinning, and
                 serial access to realize novel relay-PUF and memory-PUF
                 designs. The proposed PUFs show promising results ($
                 \approx $50\% interdie Hamming distance (HD) and 10\%
                 to 20\% intradie HD) in terms of randomness, stability,
                 and resistance to attacks. We have investigated
                 noninvasive attacks, such as machine learning and
                 magnetic field attack, and have assessed the PUFs
                 resilience.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Vatajelu:2016:SMB,
  author =       "Elena Ioana Vatajelu and Giorgio {Di Natale} and Mario
                 Barbareschi and Lionel Torres and Marco Indaco and
                 Paolo Prinetto",
  title =        "{STT--MRAM}-Based {PUF} Architecture Exploiting
                 Magnetic Tunnel Junction Fabrication-Induced
                 Variability",
  journal =      j-JETC,
  volume =       "13",
  number =       "1",
  pages =        "5:1--5:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2790302",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Physically Unclonable Functions (PUFs) are emerging
                 cryptographic primitives used to implement low-cost
                 device authentication and secure secret key generation.
                 Weak PUF's (i.e., devices able to generate a single
                 signature or to deal with a limited number of
                 challenges) are widely discussed in literature. One of
                 the most investigated solutions today is based on
                 SRAMs. However, the rapid development of low-power,
                 high-density, high-performance SoCs has pushed the
                 embedded memories to their limits and opened the field
                 to the development of emerging memory technologies. The
                 Spin-Transfer-Torque Magnetic Random Access Memory
                 (STT-MRAM) has emerged as a promising choice for
                 embedded memories due to its reduced read/write latency
                 and high CMOS integration capability. In this article,
                 we propose an innovative PUF design based on STT-MRAM
                 memory. We exploit the high variability affecting the
                 electrical resistance of the Magnetic Tunnel Junction
                 (MTJ) device in anti-parallel magnetization. We will
                 demonstrate that the proposed solution is robust,
                 unclonable, and unpredictable.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Quadir:2016:SCS,
author = "Shahed E. Quadir and Junlin Chen and Domenic Forte and
Navid Asadizanjani and Sina Shahbazmohamadi and Lei
Wang and John Chandy and Mark Tehranipoor",
title = "A Survey on Chip to System Reverse Engineering",
journal = j-JETC,
volume = "13",
number = "1",
pages = "6:1--6:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2755563",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Sat Apr 8 10:16:07 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "The reverse engineering (RE) of electronic chips and
systems can be used with honest and dishonest
intentions. To inhibit RE for those with dishonest
intentions (e.g., piracy and counterfeiting), it is
important that the community is aware of the
state-of-the-art capabilities available to attackers
today. In this article, we will be presenting a survey
of RE and anti-RE techniques on the chip, board, and
system levels. We also highlight the current challenges
and limitations of anti-RE and the research needed to
overcome them. This survey should be of interest to
both governmental and industrial bodies whose critical
systems and intellectual property (IP) require
protection from foreign enemies and counterfeiters who
possess advanced RE capabilities.",
acknowledgement = ack-nhfb,
articleno = "6",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Castro:2016:FVB,
author = "Stephan {De Castro} and Jean-Max Dutertre and Bruno
Rouzeyre and Giorgio {Di Natale} and Marie-Lise
Flottes",
title = "Frontside Versus Backside Laser Injection: a
Comparative Study",
journal = j-JETC,
volume = "13",
number = "1",
pages = "7:1--7:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2845999",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Sat Apr 8 10:16:07 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "The development of cryptographic devices was followed
by the development of so-called implementation attacks,
which are intended to retrieve secret information
exploiting the hardware itself. Among these attacks,
fault attacks can be used to disturb the circuit while
performing a computation to retrieve the secret. Among
possible means of injecting a fault, laser beams have
proven to be accurate and powerful. The laser can be
used to illuminate the circuit either from its
frontside (i.e., where metal interconnections are first
encountered) or from the backside (i.e., through the
substrate). Historically, frontside injection was
preferred because it does not require the die to be
thinned. Nevertheless, due to the increasing
integration of metal layers in modern technologies,
frontside injections do not allow targeting of any
desired location. Indeed, metal lines act as mirrors,
and they reflect and refract most of the energy
provided by the laser beam. Conversely, backside
injections, although more difficult to set up, allow an
increase of the resolution of the target location and
remove the drawbacks of the frontside technique. This
article compares experimental results from frontside
and backside fault injections. The effectiveness of the
two techniques is measured in terms of exploitable
errors on an AES circuit (i.e., errors that can be used
to extract the value of the secret key used during the
encryption process). We will show, conversely to what
is generally assumed, that frontside injection can
provide even better results compared to backside
injection, especially for low-cost beams with a large
laser spot.",
acknowledgement = ack-nhfb,
articleno = "7",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Barenghi:2016:FBS,
author = "Alessandro Barenghi and Guido M. Bertoni and Luca
Breveglieri and Gerardo Pelosi and Stefano Sanfilippo
and Ruggero Susella",
title = "A Fault-Based Secret Key Retrieval Method for {ECDSA}:
Analysis and Countermeasure",
journal = j-JETC,
volume = "13",
number = "1",
pages = "8:1--8:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2767132",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Sat Apr 8 10:16:07 MDT 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/jetc/;
https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Elliptic curve cryptosystems proved to be well suited
for securing systems with constrained resources like
embedded and portable devices. In a fault-based attack,
errors are induced during the computation of a
cryptographic primitive, and the results are collected
to derive information about the secret key safely
stored in the device. We introduce a novel attack
methodology to recover the secret key employed in
implementations of the Elliptic Curve Digital Signature
Algorithm. Our attack exploits the information leakage
induced when altering the execution of the modular
arithmetic operations used in the signature primitive
and does not rely on the underlying elliptic curve
mathematical structure, thus being applicable to all
standardized curves. We provide both a validation of
the feasibility of the attack, even employing common
off-the-shelf hardware to perform the required
computations, and a low-cost countermeasure to
counteract it.",
acknowledgement = ack-nhfb,
articleno = "8",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Lao:2016:BFD,
author = "Yingjie Lao and Qianying Tang and Chris H. Kim and
Keshab K. Parhi",
title = "Beat Frequency Detector-Based High-Speed True Random
Number Generators: Statistical Modeling and Analysis",
journal = j-JETC,
volume = "13",
number = "1",
pages = "9:1--9:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2866574",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Sat Apr 8 10:16:07 MDT 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/jetc/;
https://www.math.utah.edu/pub/tex/bib/jetc.bib;
https://www.math.utah.edu/pub/tex/bib/prng.bib",
abstract = "True random number generators (TRNGs) are crucial
components for the security of cryptographic systems.
In contrast to pseudo--random number generators
(PRNGs), TRNGs provide higher security by extracting
randomness from physical phenomena. To evaluate a TRNG,
statistical properties of the circuit model and raw
bitstream should be studied. In this article, a model
for the beat frequency detector--based high-speed TRNG
(BFD-TRNG) is proposed. The parameters of the model are
extracted from the experimental data of a test chip. A
statistical analysis of the proposed model is carried
out to derive mean and variance of the counter values
of the TRNG. Our statistical analysis results show that
mean of the counter values is inversely proportional to
the frequency difference of the two ring oscillators
(ROSCs), whereas the dynamic range of the counter
values increases linearly with standard deviation of
environmental noise and decreases with increase of the
frequency difference. Without the measurements from the
test data, a model cannot be created; similarly,
without a model, performance of a TRNG cannot be
predicted. The key contribution of the proposed
approach lies in fitting the model to measured data and
the ability to use the model to predict performance of
BFD-TRNGs that have not been fabricated. Several novel
alternate BFD-TRNG architectures are also proposed;
these include parallel BFD, cascade BFD, and
parallel-cascade BFD. These TRNGs are analyzed using
the proposed model, and it is shown that the parallel
BFD structure requires less area per bit, whereas the
cascade BFD structure has a larger dynamic range while
maintaining the same mean of the counter values as the
original BFD-TRNG. It is shown that 3.25 M and 4 M
random bits can be obtained per counter value from
parallel BFD and parallel-cascade BFD, respectively,
where M counter values are computed in parallel.
Furthermore, the statistical analysis results
illustrate that BFD-TRNGs have better randomness and
less cost per bit than other existing ROSC-TRNG
designs. For example, it is shown that BFD-TRNGs
accumulate 150\% more jitter than the original
two-oscillator TRNG and that parallel BFD-TRNGs require
one-third power and one-half area for same number of
random bits for a specified period.",
acknowledgement = ack-nhfb,
articleno = "9",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Kulkarni:2016:RTA,
author = "Amey Kulkarni and Youngok Pino and Matthew French and
Tinoosh Mohsenin",
title = "Real-Time Anomaly Detection Framework for Many-Core
Router through Machine-Learning Techniques",
journal = j-JETC,
volume = "13",
number = "1",
pages = "10:1--10:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2827699",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Sat Apr 8 10:16:07 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "In this article, we propose a real-time anomaly
detection framework for an NoC-based many-core
architecture. We assume that processing cores and
memories are safe and anomaly is included through a
communication medium (i.e., router). The article
targets three different attacks, namely, traffic
diversion, route looping, and core address spoofing
attacks. The attacks are detected by using
machine-learning techniques. Comprehensive analysis on
machine-learning algorithms suggests that Support
Vector Machine (SVM) and K-Nearest Neighbor (K-NN) have
better attack detection efficiency. It has been
observed that both algorithms have accuracy in the
range of 94\% to 97\%. Additional hardware complexity
analysis advocates SVM to be implemented on hardware.
To test the framework, we implement a condition-based
attack insertion module; attacks are performed intra-
and intercluster. The proposed real-time anomaly
detection framework is fully placed and routed on
Xilinx Virtex-7 FPGA. Postplace and -route
implementation results show that SVM has 12\% to 2\%
area overhead and 3\% to 1\% power overhead for the
quad-core and 16-core implementation, respectively. It
is also observed that it takes 25\% to 18\% of the
total execution time to detect an anomaly in
transferred packets for quad-core and 16-core,
respectively. The proposed framework achieves 65\%
reduction in area overhead and is 3 times faster
compared to previous published work.",
acknowledgement = ack-nhfb,
articleno = "10",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Deb:2016:GVS,
author = "Arighna Deb and Robert Wille and Oliver Kesz{\"o}cze
and Stefan Hillmich and Rolf Drechsler",
title = "Gates vs. Splitters: Contradictory Optimization
Objectives in the Synthesis of Optical Circuits",
journal = j-JETC,
volume = "13",
number = "1",
pages = "11:1--11:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2904445",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Sat Apr 8 10:16:07 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Optical circuits are considered a promising emerging
technology for applications in ultra-high-speed
networks or interconnects. However, the development of
(automatic) synthesis approaches for such circuits is
still in its infancy. Although first generic and
automatic synthesis approaches have been proposed, no
clear understanding exists yet on how to keep the costs
of the resulting circuits as small as possible. In the
domain of optical circuits, this is particularly
interesting for the number of gates and the effect of
so-called splitters to the signal strength. In this
work, we investigate this relation by considering a
variety of (existing as well as proposed) synthesis
approaches for optical circuits. Our investigations
show that reducing the number of gates and reducing the
number of splitters are contradictory optimization
objectives. Furthermore, the performance of synthesis
guided with respect to gate efficiency as well as
synthesis guided with respect to splitter freeness is
evaluated and an overhead factor between the
contradictory metrics is experimentally determined.",
acknowledgement = ack-nhfb,
articleno = "11",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Todri-Sanial:2017:GES,
author = "Aida Todri-Sanial and Saraju P. Mohanty and Mariane
Comte and Marc Belleville",
title = "Guest Editorial: Special Issue on Nanoelectronic
Circuit and System Design Methods for the Mobile
Computing Era",
journal = j-JETC,
volume = "13",
number = "2",
pages = "12:1--12:??",
month = mar,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3003370",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Sat Apr 8 10:16:07 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
acknowledgement = ack-nhfb,
articleno = "12",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Sartor:2017:EIH,
author = "Anderson L. Sartor and Arthur F. Lorenzon and Luigi
Carro and Fernanda Kastensmidt and Stephan Wong and
Antonio C. S. Beck",
title = "Exploiting Idle Hardware to Provide Low Overhead Fault
Tolerance for {VLIW} Processors",
journal = j-JETC,
volume = "13",
number = "2",
pages = "13:1--13:??",
month = mar,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3001935",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Sat Apr 8 10:16:07 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Because of technology scaling, the soft error rate has
been increasing in digital circuits, which affects
system reliability. Therefore, modern processors,
including VLIW architectures, must have means to
mitigate such effects to guarantee reliable computing.
In this scenario, our work proposes three low overhead
fault tolerance approaches based on instruction
duplication with zero latency detection, which uses a
rollback mechanism to correct soft errors in the
pipelanes of a configurable VLIW processor. The first
uses idle issue slots within a period of time to
execute extra instructions considering distinct
application phases. The second works at a finer grain,
adaptively exploiting idle functional units at
run-time. However, some applications present high
instruction-level parallelism (ILP), so the ability to
provide fault tolerance is reduced: less functional
units will be idle, decreasing the number of potential
duplicated instructions. The third approach attacks
this issue by dynamically reducing ILP according to a
configurable threshold, increasing fault tolerance at
the cost of performance. While the first two approaches
achieve significant fault coverage with minimal area
and power overhead for applications with low ILP, the
latter improves fault tolerance with low performance
degradation. All approaches are evaluated considering
area, performance, power dissipation, and error
coverage.",
acknowledgement = ack-nhfb,
articleno = "13",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Fang:2017:SPM,
author = "Yan Fang and Victor V. Yashin and Brandon B. Jennings
and Donald M. Chiarulli and Steven P. Levitan",
title = "A Simplified Phase Model for Simulation of
Oscillator-Based Computing Systems",
journal = j-JETC,
volume = "13",
number = "2",
pages = "14:1--14:??",
month = mar,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/2976743",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Sat Apr 8 10:16:07 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Building oscillator-based computing systems with
emerging nano-device technologies has become a
promising solution for unconventional computing tasks
like computer vision and pattern recognition. However,
simulation and analysis of these computing systems is
both time and compute intensive due to the nonlinearity
of new devices and the complex behavior of coupled
oscillators. In order to speed up the simulation of
coupled oscillator systems, we propose a simplified
phase model to perform phase and frequency
synchronization prediction based on a synthesis of
earlier models. Our model can predict the
frequency-locking behavior with several orders of
magnitude speedup compared to direct evaluation,
enabling the effective and efficient simulation of the
large numbers of oscillators required for practical
computing systems. We demonstrate the oscillator-based
computing paradigm with three applications, pattern
matching, convolution, and image segmentation. The
simulation with these models are respectively sped up
by factors of 780, 300, and 1120 in our tests.",
acknowledgement = ack-nhfb,
articleno = "14",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Singhvi:2017:FGU,
author = "Ajay Singhvi and Matheus T. Moreira and Ramy N. Tadros
and Ney L. V. Calazans and Peter A. Beerel",
title = "A Fine-Grain, Uniform, Energy-Efficient Delay Element
for $2$-Phase Bundled-Data Circuits",
journal = j-JETC,
volume = "13",
number = "2",
pages = "15:1--15:??",
month = mar,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/2948067",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Sat Apr 8 10:16:07 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Contemporary digitally controlled delay elements (DEs)
trade off power overheads and delay quantization error
(DQE). This article proposes a new programmable DE that
provides a balanced design that yields low power with
moderate DQE even under process, voltage, and
temperature variations. The element employs and
leverages the advantages offered by a 28nm fully
depleted silicon on insulator technology, using back
body biasing to add an extra dimension to its
programmability. To do so, a novel generic delay shift
block is proposed, which enables incorporating both
fine and coarse delays in a single DE that can be
easily integrated into digital systems, which is an
advantage over hybrid DEs that rely on analog design.",
acknowledgement = ack-nhfb,
articleno = "15",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Mohammadi:2017:FTR,
author = "Hassan Ghasemzadeh Mohammadi and Pierre-Emmanuel
Gaillardon and Jian Zhang and Giovanni {De Micheli} and
Ernesto Sanchez and Matteo {Sonza Reorda}",
title = "A Fault-Tolerant Ripple-Carry Adder with
Controllable-Polarity Transistors",
journal = j-JETC,
volume = "13",
number = "2",
pages = "16:1--16:??",
month = mar,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/2988234",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Sat Apr 8 10:16:07 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "This article first explores the effects of faults on
circuits implemented with controllable-polarity
transistors. We propose a new fault model that suits
the characteristics of these devices, and we report the
results of a SPICE-based analysis of the effects of
faults on the behavior of some basic gates implemented
with them. Hence, we show that the considered devices
are able to intrinsically tolerate a rather high number
of faults. We finally exploit this property to build a
robust and scalable adder whose area, performance, and
leakage power characteristics are improved by 15\%,
18\%, and 12\%, respectively, when compared to an
equivalent FinFET solution at 22nm technology node.",
acknowledgement = ack-nhfb,
articleno = "16",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Senni:2017:NVP,
author = "Sophiane Senni and Lionel Torres and Gilles Sassatelli
and Abdoulaye Gamati{\'e}",
title = "Non-Volatile Processor Based on {MRAM} for
Ultra-Low-Power {IoT} Devices",
journal = j-JETC,
volume = "13",
number = "2",
pages = "17:1--17:??",
month = mar,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3001936",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Sat Apr 8 10:16:07 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Over the past few years, a new era of smart connected
devices has emerged in the market to enable the future
world of the Internet of Things (IoT). A key
requirement for IoT applications is the power
consumption to allow very high autonomy in the case of
battery-powered systems. Depending on the application,
such devices will be most of the time in a low-power
mode (sleep mode) and will wake up only when there is a
task to accomplish (active mode). Emerging non-volatile
memory technologies are seen as a very attractive
solution to design ultra-low-power systems. Among these
technologies, magnetic random access memory is a
promising candidate, as it combines non-volatility,
high density, reasonable latency, and low leakage.
Integration of non-volatility as a new feature of
memories has the great potential to allow full data
retention after a complete shutdown with a fast wake-up
time. This article explores the benefits of having a
non-volatile processor to enable ultra-low-power IoT
devices.",
acknowledgement = ack-nhfb,
articleno = "17",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Rakshit:2017:MTS,
author = "Joydeep Rakshit and Kartik Mohanram and Runlai Wan and
Kai Tak Lam and Jing Guo",
title = "Monolayer Transistor {SRAMs}: Toward Low-Power, Denser
Memory Systems",
journal = j-JETC,
volume = "13",
number = "2",
pages = "18:1--18:??",
month = mar,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/2967613",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Sat Apr 8 10:16:07 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Monolayer heterojunction FETs based on vertical
heterogeneous transition metal dichalcogenides
(TMDCFETs) and planar black phosphorus FETs (BPFETs)
have demonstrated excellent subthreshold swing, high
I$_{ON}$/I$_{OFF}$, and high scalability, making them
attractive candidates for post-CMOS memory design. This
article explores TMDCFET and BPFET SRAM design by
combining atomistic self-consistent device modeling
with SRAM circuit design and simulation. We perform
detailed evaluations of the TMDCFET/BPFET SRAMs at a
single bitcell and at SRAM array level. Our simulations
show that at low operating voltages, TMDCFET/BPFET
SRAMs exhibit significant advantages in static power,
dynamic read/write noise margin, and read/write delay
over nominal 16nm CMOS SRAMs at both bitcell and
array-level implementations. We also analyze the effect
of process variations on the performance of
TMDCFET/BPFET SRAMs. Our simulations demonstrate that
TMDCFET/BPFET SRAMs exhibit high tolerance to process
variations, which is desirable for low operating
voltages.",
acknowledgement = ack-nhfb,
articleno = "18",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Wang:2017:ACP,
author = "Xuan Wang and Jiang Xu and Zhe Wang and Haoran Li and
Zhehui Wang and Peng Yang and Luan H. K. Duong and
Rafael K. V. Maeda and Zhifei Wang",
title = "Alleviate Chip Pin Constraint for Multicore Processor
by On\slash Off-Chip Power Delivery System Codesign",
journal = j-JETC,
volume = "13",
number = "2",
pages = "19:1--19:??",
month = mar,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/2914791",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Sat Apr 8 10:16:07 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "The number of chip pins is limited due to the cost and
reliability issues of sophisticated packages, and it is
predicted that the chip pin count will be overstretched
to satisfy the requirements of both power delivery and
memory access. The gap between the achievable pin count
and the demand will increase as the technology scales,
due to the increasing computation resources and supply
current. Pin reduction techniques are thus required for
continued computing performance growth. In this
article, we propose a chip pin constraint alleviation
strategy, through on/off-chip power delivery system
co-design, to effectively reduce the demand for power
pins. An analytical model of a power delivery system,
consisting of on/off-chip regulators and a power
delivery network, is proposed to evaluate the influence
of regulator design and package conduction loss. By
combining this model with a multi-core processor model
of performance and memory bandwidth requirements, we
characterize the entire multi-core processor system to
investigate the relationship between the chip pin
constraint and performance in multi-core processor
scaling and the effectiveness of our strategy.
Experiments show that with the conventional power
delivery system design, the chip pin constraint
severely limits the performance growth as the
technology scales. Using the on/off-chip power delivery
system co-design, our strategy achieves a significant
pin count reduction, for example, 31.3\% at the 8nm
technology node, compared to the conventional design
with the same chip performance, while, provided with
the same chip pin count, it is able to improve, by
35.0\%, the chip performance at 8nm compared to the
conventional design. For real applications of different
parallelism, our strategy outperforms its counterpart,
with a 23.7\% performance improvement on average at the
8nm technology node.",
acknowledgement = ack-nhfb,
articleno = "19",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Pajouhi:2017:YAE,
author = "Zoha Pajouhi and Xuanyao Fong and Anand Raghunathan
and Kaushik Roy",
title = "Yield, Area, and Energy Optimization in {STT--MRAMs}
Using Failure-Aware {ECC}",
journal = j-JETC,
volume = "13",
number = "2",
pages = "20:1--20:??",
month = mar,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/2934685",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Sat Apr 8 10:16:07 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Spin-Transfer Torque MRAMs are attractive due to their
non-volatility, high density, and zero leakage.
However, STT-MRAMs suffer from poor reliability due to
shared read and write paths. Additionally, conflicting
requirements for data retention and writeability (both
related to the energy barrier height of the storage
device) makes design more challenging. Furthermore, the
energy barrier height depends on the geometry of the
storage. Any variations in the geometry of the storage
device lead to variations in the energy barrier height.
In order to address the poor reliability of STT-MRAMs,
usage of Error Correcting Codes (ECC) has been
proposed. Unlike traditional CMOS memory technologies,
ECC is expected to correct both soft and hard errors in
STT-MRAMs. To achieve acceptable yield with low write
power, stronger ECC is required, resulting in increased
number of encoded bits and degraded memory capacity. In
this article, we propose Failure-aware ECC (FaECC),
which masks permanent faults while maintaining the same
correction capability for soft errors without increased
number of encoded bits. Furthermore, we investigate the
impact of process variations on run-time reliability of
STT-MRAMs. In order to analyze the effectiveness of our
methodology, we developed a cross-layer simulation
framework that consists of device, circuit and array
level analysis of STT-MRAM memory arrays. Our results
show that using FaECC relaxes the requirements on the
energy barrier height, which reduces the write energy
and results in smaller access transistor size and
memory array area.",
acknowledgement = ack-nhfb,
articleno = "20",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Mankalale:2017:OSC,
author = "Meghna G. Mankalale and Sachin S. Sapatnekar",
title = "Optimized Standard Cells for All-Spin Logic",
journal = j-JETC,
volume = "13",
number = "2",
pages = "21:1--21:??",
month = mar,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/2967612",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Sat Apr 8 10:16:07 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "All-Spin Logic (ASL) devices provide a promising
spintronics-based alternative for Boolean logic
implementations in the post-Complementary Metal-Oxide
Semiconductor (CMOS) era. In principle, any logic
functionality can be implemented in ASL. In practice,
the performance of an ASL gate is significantly
affected by layout choices, but such implications have
not been adequately explored in the past. This article
proposes a systematic approach for building standard
cells in ASL, which are a basic building block in an
overall design methodology for implementing large
ASL-based circuits. We first propose a new technique to
reduce the magnet count for an ASL majority gate but
still ensure correct functioning through layout
optimization methods. Building on physics-based
analysis, we then build a standard cell library with
diverse functionality and characterize the library for
delay, energy, and area. We perform delay-optimized
technology mapping on ISCAS85 benchmark circuits using
our library. Our approach results in circuits that are
12.90\% faster, consume 26.16\% less energy, and are
33.56\% more area efficient compared to a standard cell
library that does not incorporate layout-based
optimization techniques of our work.",
acknowledgement = ack-nhfb,
articleno = "21",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Jiang:2017:SLD,
author = "Wei Jiang and Liang Wen and Ke Jiang and Xia Zhang and
Xiong Pan and Keran Zhou",
title = "System-Level Design to Detect Fault Injection Attacks
on Embedded Real-Time Applications",
journal = j-JETC,
volume = "13",
number = "2",
pages = "22:1--22:??",
month = mar,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/2967611",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Sat Apr 8 10:16:07 MDT 2017",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Fault injection attack has been a serious threat to
security-critical embedded systems for a long time, yet
existing research ignores addressing of the problem
from a system-level perspective. This article presents
an approach to the synthesis of secure real-time
applications mapped on distributed embedded systems,
which focuses on preventing fault injection attacks of
the security protection on processing units. We utilize
symmetric cryptographic service to protect
confidentiality and deploy fault detection within a
confidential algorithm to resist fault injection
attacks. Several fault detection schemes are
identified, and their fault coverage rates and time
overheads are derived and measured. Our synthesis
approach makes efforts to determine the best fault
detection schemes for the encryption/decryption of
messages such that the overall security strength of
detecting a fault injection attack is maximized and the
deadline constraint of the real-time applications is
guaranteed. Due to the complexity of the problem, we
propose an efficient algorithm based on the fruit fly
optimization algorithm, and we compare it to the
simulated annealing approach. Extensive experiments and
a real-life application evaluation demonstrate the
superiority of our approach.",
acknowledgement = ack-nhfb,
articleno = "22",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Goud:2017:AUF,
  author =       "A. Arun Goud and Rangharajan Venkatesan and Anand
                 Raghunathan and Kaushik Roy",
  title =        "Asymmetric Underlapped {FinFETs} for Near- and
                 Super-Threshold Logic at Sub-10nm Technology Nodes",
  journal =      j-JETC,
  volume =       "13",
  number =       "2",
  pages =        "23:1--23:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2967615",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Extending double-gate FinFET scaling to sub-10nm
                 technology regime requires device-engineering
                 techniques for countering the rise of direct source to
                 drain tunneling (DSDT), edge direct tunneling (EDT) and
                 short channel effects (SCE) that degrade FinFET I-V
                 characteristics. Symmetric underlap is effective for
                 eliminating EDT, diminishing DSDT, and lowering the
                 fringe component of gate capacitance. However,
                 excessive symmetric underlap also lowers the
                 on-current, which is mainly due to thermionic emission.
                 In this work, it is demonstrated that at sub-10nm node,
                 asymmetric underlapped FinFETs with slightly longer
                 underlap toward drain side than source side are
                 superior to symmetric underlapped FinFETs due to
                 further improvement in $ I_{\rm on} / I_{\rm off} $ and
                 reduction in gate-to-drain capacitance. Using quantum
                 mechanical device simulations, FinFETs with various
                 degrees of underlap have been analyzed for improvement
                 in I-V characteristics. A FinFET model for circuit
                 simulations has been constructed that captures the
                 major sub-10nm leakage components, namely, thermionic
                 emission, DSDT, EDT, direct gate oxide tunneling and
                 its associated components. By simulating a 10-stage
                 NAND circuit and a LEON3 processor with interconnect
                 parasitics using these devices, it is shown that
                 asymmetric underlap instead of symmetric underlap in
                 sub-10nm FinFETs can offer lower energy consumption
                 with improved performance for near-threshold logic and
                 higher energy-efficiency for super-threshold logic
                 operation.",
  acknowledgement = ack-nhfb,
  articleno =    "23",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Abellan:2017:EPN,
  author =       "Jos{\'e} L. Abell{\'a}n and Chao Chen and Ajay Joshi",
  title =        "Electro-Photonic {NoC} Designs for Kilocore Systems",
  journal =      j-JETC,
  volume =       "13",
  number =       "2",
  pages =        "24:1--24:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2967614",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "The increasing core count in manycore systems requires
                 a corresponding large Network-on-chip (NoC) bandwidth
                 to support the overlying applications. However, it is
                 not possible to provide this large bandwidth in an
                 energy-efficient manner using electrical link
                 technology. To overcome this issue, photonic link
                 technology has been proposed as a replacement. This
                 work explores the limits and opportunities for using
                 photonic links to design the NoC architecture for a
                 future Kilocore system. Three different NoC designs are
                 explored: ElecNoC, an electrical concentrated
                 two-dimensional- (2D) mesh NoC; HybNoC, an electrical
                 concentrated 2D mesh with a photonic multi-crossbar
                 NoC; and PhotoNoC, a photonic multi-bus NoC. We
                 consider both private and shared cache architectures
                 and, to leverage the large bandwidth density of
                 photonic links, we investigate the use of prefetching
                 and aggressive non-blocking caches. Our analysis using
                 contemporary Big Data workloads shows that the
                 non-blocking caches with a shared LLC can best leverage
                 the large bandwidth of the photonic links in the
                 Kilocore system. Moreover, compared to ElecNoC-based
                 and HybNoC-based Kilocore systems, a PhotoNoC-based
                 Kilocore system achieves up to 2.5$ \times $ and 1.5$
                 \times $ better performance, respectively, and can
                 support up to 2.1$ \times $ and 1.1$ \times $ higher
                 bandwidth, respectively, while dissipating comparable
                 power in the overall system.",
  acknowledgement = ack-nhfb,
  articleno =    "24",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Wang:2017:OSS,
  author =       "Yao Wang and Liang Rong and Haibo Wang and Guangjun
                 Wen",
  title =        "One-Step Sneak-Path Free Read Scheme for Resistive
                 Crossbar Memory",
  journal =      j-JETC,
  volume =       "13",
  number =       "2",
  pages =        "25:1--25:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3012002",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "A one-step sneak-path free read scheme for resistive
                 crossbar memory is proposed in this article. During
                 read operation, it configures the crossbar array into a
                 four-terminal resistance network, which is composed of
                 the selected cell and three other resistors
                 corresponding to unselected cells that contribute to
                 the sneak-path. Two sensing voltages with equal
                 potential are applied to three terminals of the
                 network. One is for sensing the resistance of the
                 selected cell; the other is for creating zero-voltage
                 drop across one of the three resistors, which connects
                 the sneak-path to the selected cell. This effectively
                 suppresses the current injected by the sneak-path to
                 the selected cell-sensing loop. This work also proposes
                 a cost-effective data-encoding circuit that guarantees
                 that at least half of the memory cells are in a
                 high-resistance state, which further minimizes
                 sneak-path current. The impact of key design
                 parameters, such as sensing voltage, switch
                 on-resistance, and the ratio of memory cell resistances
                 in different states, as well as nonideal effects are
                 investigated. Equations for estimating the maximum
                 array size to share a single read circuit are derived.
                 The effectiveness of the proposed design has been
                 validated via circuit simulations. Impacts of the
                 word-/bit-line resistance are also analyzed.",
  acknowledgement = ack-nhfb,
  articleno =    "25",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Guler:2017:ULL,
  author =       "Abdullah Guler and Niraj K. Jha",
  title =        "Ultra-low-leakage, Robust {FinFET SRAM} Design Using
                 Multiparameter Asymmetric {FinFETs}",
  journal =      j-JETC,
  volume =       "13",
  number =       "2",
  pages =        "26:1--26:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2988233",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Memory arrays consisting of Static Random Access
                 Memory (SRAM) cells occupy the largest area on chip and
                 are responsible for significant leakage power
                 consumption in modern microprocessors. With the
                 transition from planar Complementary
                 Metal-Oxide-Semiconductor (CMOS) technology to FinFETs,
                 FinFET SRAM design has become important. However,
                 increasing leakage power consumption of FinFETs due to
                 aggressive scaling, width quantization, read-write
                 conflict, and process variations make FinFET SRAM
                 design challenging. In this article, we show how
                 Multiparameter Asymmetric (MPA) FinFETs can be used to
                 design ultra-low-leakage and robust 6T SRAM cells. We
                 combine multiple asymmetries, namely, asymmetry in gate
                 work function, source/drain doping concentration, and
                 gate underlap, to address various SRAM design issues
                 all at once. We propose five novel MPA FinFET SRAM cell
                 designs and compare them with symmetric and
                 Single-Parameter Asymmetric (SPA) FinFET SRAM cells
                 using dc and transient metrics. We show that the
                 leakage current of MPA FinFET SRAM cells can be reduced
                 by up to 58 $ \times $ while ensuring reasonable
                 read/write stability metric values. In addition, high
                 stability metric values can be achieved with 22 $
                 \times $ leakage current reduction compared to the
                 traditional symmetric FinFET SRAM cell. There is no
                 area overhead associated with MPA FinFET SRAM cells.",
  acknowledgement = ack-nhfb,
  articleno =    "26",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Zhang:2017:SSR,
  author =       "Hang Zhang and Xuhao Chen and Nong Xiao and Lei Wang
                 and Fang Liu and Wei Chen and Zhiguang Chen",
  title =        "Shielding {STT-RAM} Based Register Files on {GPUs}
                 against Read Disturbance",
  journal =      j-JETC,
  volume =       "13",
  number =       "2",
  pages =        "27:1--27:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2996191",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "To address the high energy consumption issue of SRAM
                 on GPUs, emerging Spin-Transfer Torque (STT-RAM) memory
                 technology has been intensively studied to build GPU
                 register files for better energy-efficiency, thanks to
                 its benefits of low leakage power, high density, and
                 good scalability. However, STT-RAM suffers from the
                 read disturbance issue, which stems from the fact that
                 the voltage difference between read current and write
                 current becomes smaller as technology scales. The read
                 disturbance leads to high error rates for read
                 operations, which cannot be effectively protected by
                 the SEC-DED ECC on large-capacity register files of
                 GPUs. Prior schemes (e.g., read-restore) to mitigate
                 the read disturbance usually incur either non-trivial
                 performance loss or excessive energy overhead, thus not
                 applicable for the GPU register file design that aims
                 to achieve both high performance and energy-efficiency.
                 To combat the read disturbance, we propose a novel
                 software-hardware co-designed solution (i.e.,
                 Red-Shield), which consists of three optimizations to
                 overcome the limitations of the existing solutions.
                 First, we identify dead reads at compiling stage and
                 augment instructions to avoid unnecessary restores.
                 Second, we employ a small read buffer to accommodate
                 register reads with high-access locality to further
                 reduce restores. Third, we propose an adaptive restore
                 mechanism to selectively pick the suitable restore
                 scheme, according to the busy status of corresponding
                 register banks. Experimental results show that our
                 proposed design can effectively mitigate the
                 performance loss and energy overhead caused by restore
                 operations while still maintaining the reliability of
                 reads.",
  acknowledgement = ack-nhfb,
  articleno =    "27",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Biswas:2017:SAT,
  author =       "Arnab Kumar Biswas",
  title =        "Source Authentication Techniques for Network-on-Chip
                 Router Configuration Packets",
  journal =      j-JETC,
  volume =       "13",
  number =       "2",
  pages =        "28:1--28:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2996194",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "It is known that maliciously configured
                 Network-on-Chip routers can enable an attacker to
                 launch different attacks inside a Multiprocessor
                 System-on-Chip. A source authentication mechanism for
                 router configuration packets can prevent such
                 vulnerability. This ensures that a router is configured
                 by the configuration packets sent only by a trusted
                 configuration source. Conventional method like Secure
                 Hash Algorithm-3 (SHA-3) can provide required source
                 authentication in a router but with a router area
                 overhead of 1355.25\% compared to a normal router area.
                 We propose eight source authentication mechanisms that
                 can achieve similar level of security as SHA-3 for a
                 router configuration perspective without causing
                 significant area and power increase. Moreover, the
                 processing time of our proposed techniques is 1/100th
                 of SHA-3 implementation. Most of our proposed
                 techniques use different timing channel watermarking
                 methods to transfer source authentication data to the
                 receiver router. We also propose the Individual
                 packet-based stream authentication technique and
                 combinations of this technique with timing channel
                 watermarking techniques. It is shown that, among all of
                 our proposed techniques, maximum router area increment
                 required is 28.32\% compared to a normal router.",
  acknowledgement = ack-nhfb,
  articleno =    "28",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Mittal:2017:STA,
  author =       "Sparsh Mittal",
  title =        "A Survey of Techniques for Architecting Processor
                 Components Using Domain-Wall Memory",
  journal =      j-JETC,
  volume =       "13",
  number =       "2",
  pages =        "29:1--29:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2994550",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Apr 8 10:16:07 MDT 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Recent trends of increasing core-count and
                 bandwidth/memory wall have motivated researchers to
                 explore novel memory technologies for designing
                 processor components such as cache, register file,
                 shared memory, and so on. Domain-wall memory (DWM),
                 also known as racetrack memory, is a promising emerging
                 technology due to its non-volatility and very high
                 density. However, use of DWM presents challenges due to
                 characteristics of both DWM itself (e.g., requirement
                 of shift operations, variable latency) and processor
                 components. Recently, several techniques have been
                 proposed to address these challenges. This article
                 presents a survey of architectural techniques for using
                 DWM for designing components in both CPU and GPU. We
                 discuss techniques related to performance, energy, and
                 reliability and also discuss works that compare DWM
                 with other memory technologies. We also highlight the
                 opportunities and obstacles in using DWM for designing
                 processor components. This survey is expected to spark
                 further research in this area and be useful for
                 researchers, chip designers, and computer architects.",
  acknowledgement = ack-nhfb,
  articleno =    "29",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Cao:2017:GEI,
  author =       "Yu Cao and Xin Li and Taemin Kim and Suyog Gupta",
  title =        "Guest Editors' Introduction: Hardware and Algorithms
                 for On-Chip Learning",
  journal =      j-JETC,
  volume =       "13",
  number =       "3",
  pages =        "30:1--30:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3022193",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jul 11 17:10:31 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "30",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Page:2017:SHA,
  author =       "Adam Page and Ali Jafari and Colin Shea and Tinoosh
                 Mohsenin",
  title =        "{SPARCNet}: a Hardware Accelerator for Efficient
                 Deployment of Sparse Convolutional Networks",
  journal =      j-JETC,
  volume =       "13",
  number =       "3",
  pages =        "31:1--31:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3005448",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jul 11 17:10:31 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Deep neural networks have been shown to outperform
                 prior state-of-the-art solutions that often relied
                 heavily on hand-engineered feature extraction
                 techniques coupled with simple classification
                 algorithms. In particular, deep convolutional neural
                 networks have been shown to dominate on several popular
                 public benchmarks such as the ImageNet database.
                 Unfortunately, the benefits of deep networks have yet
                 to be fully exploited in embedded, resource-bound
                 settings that have strict power and area budgets.
                 Graphical processing unit (GPU) have been shown to
                 improve throughput and energy-efficiency over central
                 processing unit (CPU) due to their highly parallel
                 architecture yet still impose a significant power
                 burden. In a similar fashion, field programmable gate
                 array (FPGA) can be used to improve performance while
                 further allowing more fine-grained control over
                 implementation to improve efficiency. In order to
                 reduce power and area while still achieving required
                 throughput, classification-efficient network
                 architectures are required in addition to optimal
                 deployment on efficient hardware. In this work, we
                 target both of these enterprises. For the first
                 objective, we analyze simple, biologically inspired
                 reduction strategies that are applied both before and
                 after training. The central theme of the techniques is
                 the introduction of sparsification to help dissolve
                 away the dense connectivity that is often found at
                 different levels in convolutional neural networks. The
                 sparsification techniques include feature compression
                 partition, structured filter pruning, and dynamic
                 feature pruning. Additionally, we explore filter
                 factorization and filter quantization approximation
                 techniques to further reduce the complexity of
                 convolutional layers. In the second contribution, we
                 propose SPARCNet, a hardware accelerator for efficient
                 deployment of SPARse Convolutional NETworks. The
                 accelerator looks to enable deploying networks in such
                 resource-bound settings by both exploiting efficient
                 forms of parallelism inherent in convolutional layers
                 and by exploiting the sparsification and approximation
                 techniques proposed. To demonstrate both contributions,
                 modern deep convolutional network architectures
                 containing millions of parameters are explored within
                 the context of the computer vision dataset CIFAR.
                 Utilizing the reduction techniques, we demonstrate the
                 ability to reduce computation and memory by 60\% and
                 93\% with less than 0.03\% impact on accuracy when
                 compared to the best baseline network with 93.47\%
                 accuracy. The SPARCNet accelerator with different
                 numbers of processing engines is implemented on a
                 low-power Artix-7 FPGA platform. Additionally, the same
                 networks are optimally implemented on a number of
                 embedded commercial-off-the-shelf platforms including
                 NVIDIA's CPU+GPU SoCs TK1 and TX1 and Intel Edison.
                 Compared to NVIDIA's TK1 and TX1, the FPGA-based
                 accelerator obtains 11.8 $ \times $ and 7.5 $ \times $
                 improvement in energy efficiency while maintaining a
                 classification throughput of 72 images/s. When further
                 compared to a number of recent FPGA-based accelerators,
                 SPARCNet is able to achieve up to 15 $ \times $
                 improvement in energy efficiency while consuming less
                 than 2W of total board power at 100MHz. In addition to
                 improving efficiency, the accelerator has built-in
                 support for sparsification techniques and ability to
                 perform in-place rectified linear unit (ReLU)
                 activation function, max-pooling, and batch
                 normalization.",
  acknowledgement = ack-nhfb,
  articleno =    "31",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Anwar:2017:SPD,
  author =       "Sajid Anwar and Kyuyeon Hwang and Wonyong Sung",
  title =        "Structured Pruning of Deep Convolutional Neural
                 Networks",
  journal =      j-JETC,
  volume =       "13",
  number =       "3",
  pages =        "32:1--32:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3005348",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jul 11 17:10:31 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Real-time application of deep learning algorithms is
                 often hindered by high computational complexity and
                 frequent memory accesses. Network pruning is a
                 promising technique to solve this problem. However,
                 pruning usually results in irregular network
                 connections that not only demand extra representation
                 efforts but also do not fit well on parallel
                 computation. We introduce structured sparsity at
                 various scales for convolutional neural networks:
                 feature map-wise, kernel-wise, and intra-kernel strided
                 sparsity. This structured sparsity is very advantageous
                 for direct computational resource savings on embedded
                 computers, in parallel computing environments, and in
                 hardware-based systems. To decide the importance of
                 network connections and paths, the proposed method uses
                 a particle filtering approach. The importance weight of
                 each particle is assigned by assessing the
                 misclassification rate with a corresponding
                 connectivity pattern. The pruned network is retrained
                 to compensate for the losses due to pruning. While
                 implementing convolutions as matrix products, we
                 particularly show that intra-kernel strided sparsity
                 with a simple constraint can significantly reduce the
                 size of the kernel and feature map tensors. The
                 proposed work shows that when pruning granularities are
                 applied in combination, we can prune the CIFAR-10
                 network by more than 70\% with less than a 1\% loss in
                 accuracy.",
  acknowledgement = ack-nhfb,
  articleno =    "32",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Panda:2017:EEI,
  author =       "Priyadarshini Panda and Abhronil Sengupta and Kaushik
                 Roy",
  title =        "Energy-Efficient and Improved Image Recognition with
                 Conditional Deep Learning",
  journal =      j-JETC,
  volume =       "13",
  number =       "3",
  pages =        "33:1--33:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3007192",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jul 11 17:10:31 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Deep-learning neural networks have proven to be very
                 successful for a wide range of recognition tasks across
                 modern computing platforms. However, the computational
                 requirements associated with such deep nets can be
                 quite high, and hence their energy-efficient
                 implementation is of great interest. Although,
                 traditionally, the entire network is utilized for the
                 recognition of all inputs, we observe that the
                 classification difficulty varies widely across inputs
                 in real-world datasets; only a small fraction of inputs
                 requires the full computational effort of a network,
                 while a large majority can be classified correctly with
                 very low effort. In this article, we propose
                 Conditional Deep Learning (CDL), where the
                 convolutional layer features are used to identify the
                 variability in the difficulty of input instances and
                 conditionally activate the deeper layers of the
                 network. We achieve this by cascading a linear network
                 of output neurons for each convolutional layer and
                 monitoring the output of the linear network to decide
                 whether classification can be terminated at the current
                 stage or not. The proposed methodology thus enables the
                 network to dynamically adjust the computational effort
                 depending on the difficulty of the input data while
                 maintaining competitive classification accuracy. The
                 overall energy benefits for MNIST/CIFAR10/Tiny ImageNet
                 datasets with state-of-the-art deep-learning
                 architectures are $ 1.84 \times $ / $ 2.83 \times $ / $
                 4.02 \times $, respectively. We further employ the
                 conditional approach to train deep-learning networks
                 from scratch with integrated supervision from the
                 additional output neurons appended at the intermediate
                 convolutional layers. Our proposed integrated CDL
                 training leads to an improvement in the gradient
                 convergence behavior giving substantial error rate
                 reduction on MNIST/CIFAR-10, resulting in improved
                 classification over state-of-the-art baseline
                 networks.",
  acknowledgement = ack-nhfb,
  articleno =    "33",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Karam:2017:MCR,
  author =       "Robert Karam and Somnath Paul and Ruchir Puri and
                 Swarup Bhunia",
  title =        "Memory-Centric Reconfigurable Accelerator for
                 Classification and Machine Learning Applications",
  journal =      j-JETC,
  volume =       "13",
  number =       "3",
  pages =        "34:1--34:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2997649",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jul 11 17:10:31 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Big Data refers to the growing challenge of turning
                 massive, often unstructured datasets into meaningful,
                 organized, and actionable data. As datasets grow from
                 petabytes to exabytes and beyond, it becomes
                 increasingly difficult to run advanced analytics,
                 especially Machine Learning (ML) applications, in a
                 reasonable time and on a practical power budget using
                 traditional architectures. Previous work has focused on
                 accelerating analytics readily implemented as SQL
                 queries on data-parallel platforms, generally using
                 off-the-shelf CPUs and General Purpose Graphics
                 Processing Units (GPGPUs) for computation or
                 acceleration. However, these systems are
                 general-purpose and still require a vast amount of data
                 transfer between the storage devices and computing
                 elements, thus limiting the system efficiency. As an
                 alternative, this article presents a reconfigurable
                 memory-centric advanced analytics accelerator that
                 operates at the last level of memory and dramatically
                 reduces energy required for data transfer. We
                 functionally validate the framework using an FPGA-based
                 hardware emulation platform and three representative
                 applications: Na{\"\i}ve Bayesian Classification,
                 Convolutional Neural Networks, and k-Means Clustering.
                 Results are compared with implementations on a modern
                 CPU and workstation GPGPU. Finally, the use of
                 in-memory dataset decompression to further reduce data
                 transfer volume is investigated. With these techniques,
                 the system achieves an average energy efficiency
                 improvement of 74$ \times $ and 212$ \times $ over GPU
                 and single-threaded CPU, respectively, while dataset
                 compression is shown to improve overall efficiency by
                 an additional 1.8$ \times $ on average.",
  acknowledgement = ack-nhfb,
  articleno =    "34",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Yuan:2017:VAR,
  author =       "Bo Yuan and Keshab K. Parhi",
  title =        "{VLSI} Architectures for the {Restricted Boltzmann
                 Machine}",
  journal =      j-JETC,
  volume =       "13",
  number =       "3",
  pages =        "35:1--35:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3007193",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jul 11 17:10:31 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Neural network (NN) systems are widely used in many
                 important applications ranging from computer vision to
                 speech recognition. To date, most NN systems are
                 processed by general processing units like CPUs or
                 GPUs. However, as the sizes of dataset and network
                 rapidly increase, the original software implementations
                 suffer from long training time. To overcome this
                 problem, specialized hardware accelerators are needed
                 to design high-speed NN systems. This article presents
                 an efficient hardware architecture of restricted
                 Boltzmann machine (RBM) that is an important category
                 of NN systems. Various optimization approaches at the
                 hardware level are performed to improve the training
                 speed. As-soon-as-possible and overlapped-scheduling
                 approaches are used to reduce the latency. It is shown
                 that, compared with the flat design, the proposed RBM
                 architecture can achieve 50\% reduction in training
                 time. In addition, an on-the-fly computation scheme is
                 also used to reduce the storage requirement of binary
                 and stochastic states by several hundreds of times.
                 Then, based on the proposed approach, a 784-2252 RBM
                 design example is developed for MNIST handwritten digit
                 recognition dataset. Analysis shows that the VLSI
                 design of RBM achieves significant improvement in
                 training speed and energy efficiency as compared to
                 CPU/GPU-based solution.",
  acknowledgement = ack-nhfb,
  articleno =    "35",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Ni:2017:DMC,
  author =       "Leibin Ni and Hantao Huang and Zichuan Liu and Rajiv
                 V. Joshi and Hao Yu",
  title =        "Distributed In-Memory Computing on Binary {RRAM}
                 Crossbar",
  journal =      j-JETC,
  volume =       "13",
  number =       "3",
  pages =        "36:1--36:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2996192",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jul 11 17:10:31 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "The recently emerging resistive random-access memory
                 (RRAM) can provide nonvolatile memory storage but also
                 intrinsic computing for matrix-vector multiplication,
                 which is ideal for the low-power and high-throughput
                 data analytics accelerator performed in memory.
                 However, the existing RRAM crossbar--based computing is
                 mainly assumed as a multilevel analog computing, whose
                 result is sensitive to process nonuniformity as well as
                 additional overhead from AD-conversion and I/O. In this
                 article, we explore the matrix-vector multiplication
                 accelerator on a binary RRAM crossbar with adaptive
                 1-bit-comparator--based parallel conversion. Moreover,
                 a distributed in-memory computing architecture is also
                 developed with the according control protocol. Both
                 memory array and logic accelerator are implemented on
                 the binary RRAM crossbar, where the logic-memory pair
                 can be distributed with the control bus protocol.
                 Experimental results have shown that compared to the
                 analog RRAM crossbar, the proposed binary RRAM crossbar
                 can achieve significant area savings with better
                 calculation accuracy. Moreover, significant speedup can
                 be achieved for matrix-vector multiplication in neural
                 network--based machine learning such that the overall
                 training and testing time can be both reduced. In
                 addition, large energy savings can be also achieved
                 when compared to the traditional CMOS-based
                 out-of-memory computing architecture.",
  acknowledgement = ack-nhfb,
  articleno =    "36",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Merkel:2017:SCB,
  author =       "Cory Merkel and Dhireesha Kudithipudi and Manan Suri
                 and Bryant Wysocki",
  title =        "Stochastic {CBRAM}-Based Neuromorphic Time Series
                 Prediction System",
  journal =      j-JETC,
  volume =       "13",
  number =       "3",
  pages =        "37:1--37:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2996193",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jul 11 17:10:31 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "In this research, we present a Conductive-Bridge RAM
                 (CBRAM)-based neuromorphic system which efficiently
                 addresses time series prediction. We propose a new (i)
                 voltage-mode, stochastic, multiweight synapse circuit
                 based on experimental bi-stable CBRAM devices, (ii) a
                 voltage-mode neuron circuit based on the concept of
                 charge sharing, and (iii) an optimized training
                 methodology powered by a stochastic implementation of
                 the Least-Mean-Squares (SLMS) training rule. To
                 validate the proposed design, we use time series
                 prediction for short-term electrical load forecasting
                 in smart grids. Our system is able to forecast hourly
                 electrical loads with a mean accuracy of 96\%, an
                 estimated power dissipation of 15 $ \mu $W, and area
                 of 14.5 $ \mu {\rm m}^2 $ at 65 nm CMOS technology.",
  acknowledgement = ack-nhfb,
  articleno =    "37",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Topaloglu:2017:EJS,
  author =       "Rasit O. Topaloglu and Naveen Verma",
  title =        "Editorial for {JETC} Special Issue on Alternative
                 Computing Systems",
  journal =      j-JETC,
  volume =       "13",
  number =       "3",
  pages =        "38:1--38:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3022700",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jul 11 17:10:31 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "38",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Britt:2017:HPC,
  author =       "Keith A. Britt and Travis S. Humble",
  title =        "High-Performance Computing with Quantum Processing
                 Units",
  journal =      j-JETC,
  volume =       "13",
  number =       "3",
  pages =        "39:1--39:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3007651",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jul 11 17:10:31 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "The prospects of quantum computing have driven efforts
                 to realize fully functional quantum processing units
                 (QPUs). Recent success in developing proof-of-principle
                 QPUs has prompted the question of how to integrate
                 these emerging processors into modern high-performance
                 computing (HPC) systems. We examine how QPUs can be
                 integrated into current and future HPC system
                 architectures by accounting for functional and physical
                 design requirements. We identify two integration
                 pathways that are differentiated by infrastructure
                 constraints on the QPU and the use cases expected for
                 the HPC system. This includes a tight integration that
                 assumes infrastructure bottlenecks can be overcome as
                 well as a loose integration that assumes they cannot.
                 We find that the performance of both approaches is
                 likely to depend on the quantum interconnect that
                 serves to entangle multiple QPUs. We also identify
                 several challenges in assessing QPU performance for
                 HPC, and we consider new metrics that capture the
                 interplay between system architecture and the quantum
                 parallelism underlying computational performance.",
  acknowledgement = ack-nhfb,
  articleno =    "39",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Yoon:2017:MUM,
  author =       "Su-Kyung Yoon and Young-Sun Youn and Kihyun Park and
                 Shin-Dug Kim",
  title =        "Mobile Unified Memory-Storage Structure Based on
                 Hybrid Non-Volatile Memories",
  journal =      j-JETC,
  volume =       "13",
  number =       "3",
  pages =        "40:1--40:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3007650",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jul 11 17:10:31 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "In mobile computing systems, the limited amount of
                 main memory space leads to page swap operation overhead
                 and data duplication in both main memory and secondary
                 storage. Furthermore, SQLite write operations in mobile
                 devices such as smartphones and tablet PCs tend to
                 frequently overwrite data to storage, significantly
                 degrading performance. Thus, this article presents a
                 unified memory-storage structure that is optimized for
                 mobile devices and blurs the boundary between the
                 existing main memory layer and secondary storage layer.
                 This structure can eliminate the conventional page-swap
                 operations that cause significant performance
                 degradation and support fast program execution time.
                 The unified memory-storage structure consists of a
                 dynamic RAM (DRAM) and phase change memory (PCM)-based
                 dual buffering module, a hybrid unified memory-storage
                 array consisting of DRAM and NAND Flash memory, and an
                 associated unified storage translation layer devised
                 for the memory address and file translation mechanism
                 as a system software module. This hybrid array of
                 non-volatile memories is formed as a single memory-disk
                 integrated storage space that can be logically divided
                 into static and dynamic spaces. Experimental results
                 show that the overall performance of the hybrid unified
                 memory-storage system with the buffering structure
                 increases by around 13\% and power consumption is also
                 improved by 35\%, compared to current mobile system.",
  acknowledgement = ack-nhfb,
  articleno =    "40",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Guha:2017:RTS,
  author =       "Krishnendu Guha and Debasri Saha and Amlan
                 Chakrabarti",
  title =        "Real-Time {SoC} Security against Passive Threats Using
                 Crypsis Behavior of Geckos",
  journal =      j-JETC,
  volume =       "13",
  number =       "3",
  pages =        "41:1--41:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3014166",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jul 11 17:10:31 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "The rapid evolution of the embedded era has witnessed
                 globalization for the design of SoC architectures in
                 the semiconductor design industry. Though issues of
                 cost and stringent marketing deadlines have been
                 resolved in such a methodology, yet the root of
                 hardware trust has been evicted. Malicious circuitry,
                 a.k.a. Hardware Trojan Horse (HTH), is inserted by
                 adversaries in the less trusted phases of design. A HTH
                 remains dormant during testing but gets triggered at
                 runtime to cause sudden active and passive attacks. In
                 this work, we focus on the runtime passive threats
                 based on the parameter delay. Nature-inspired
                 algorithms offer an alternative to the conventional
                 techniques for solving complex problems in the domain
                 of computer science. However, most are optimization
                 techniques and none is dedicated to security. We seek
                 refuge to the crypsis behavior exhibited by geckos in
                 nature to generate a runtime security technique for SoC
                 architectures, which can bypass runtime passive threats
                 of a HTH. An adaptive security intellectual property
                 (IP) that works on the proposed security principles is
                 designed. Embedded timing analysis is used for
                 experimental validation. Low area and power overhead of
                 our proposed security IP over standard benchmarks and
                 practical crypto SoC architectures as obtained in
                 experimental results supports its applicability for
                 practical implementations.",
  acknowledgement = ack-nhfb,
  articleno =    "41",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Liu:2017:CPU,
  author =       "Yin Liu and Keshab K. Parhi",
  title =        "Computing Polynomials Using Unipolar Stochastic
                 Logic",
  journal =      j-JETC,
  volume =       "13",
  number =       "3",
  pages =        "42:1--42:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3007648",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jul 11 17:10:31 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "This article addresses subtraction and polynomial
                 computations using unipolar stochastic logic.
                 Stochastic computing requires simple logic gates, and
                 stochastic logic--based circuits are inherently fault
                 tolerant. Thus, these structures are well suited for
                 nanoscale CMOS technologies. It is well known that an
                 AND gate and a multiplexer can be used to implement
                 stochastic unipolar multiplier and adder, respectively.
                 Although it is easy to realize multiplication and
                 scaled addition, implementation of subtraction is
                 nontrivial using unipolar stochastic logic.
                 Additionally, an accurate computation of subtraction is
                 critical for the implementation of polynomials with
                 negative coefficients in stochastic unipolar
                 representation. This work, for the first time,
                 demonstrates that instead of using well-known Bernstein
                 polynomials, stochastic computation of polynomials can
                 be implemented by using a stochastic subtractor and
                 factorization. Three major contributions are given in
                 this article. First, two approaches are proposed to
                 compute subtraction in stochastic unipolar
                 representation. In the first approach, the subtraction
                 operation is approximated by cascading multilevels of
                 OR and AND gates. The accuracy of the approximation is
                 improved with the increase in the number of stages. In
                 the second approach, the stochastic subtraction is
                 implemented using a multiplexer and a stochastic
                 divider. This approach requires more hardware
                 complexity due to the use of a linear-feedback shift
                 register and a counter for division. Second,
                 computation of polynomials in stochastic unipolar
                 format is presented using scaled addition and proposed
                 stochastic subtraction. Third, we propose stochastic
                 computation of polynomials using factorization.
                 Stochastic implementations of first- and second-order
                 factors are presented for different locations of
                 polynomial roots. From experimental results, it is
                 shown that the proposed stochastic logic circuits
                 require less hardware complexity than the previous
                 stochastic polynomial implementation using Bernstein
                 polynomials.",
  acknowledgement = ack-nhfb,
  articleno =    "42",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Golnari:2017:PCE,
  author =       "Pareesa Ameneh Golnari and Yavuz Yetim and Margaret
                 Martonosi and Yakir Vizel and Sharad Malik",
  title =        "{PPU}: a Control Error-Tolerant Processor for
                 Streaming Applications with Formal Guarantees",
  journal =      j-JETC,
  volume =       "13",
  number =       "3",
  pages =        "43:1--43:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2990502",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jul 11 17:10:31 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "With increasing technology scaling and design
                 complexity there are increasing threats from device and
                 circuit failures. This is expected to worsen with
                 post-CMOS devices. Current error-resilient solutions
                 ensure reliability of circuits through protection
                 mechanisms such as redundancy, error correction, and
                 recovery. However, the costs of these solutions may be
                 high, rendering them impractical. In contrast,
                 error-tolerant solutions allow errors in the
                 computation and are positioned to be suitable for
                 error-tolerant applications such as media applications.
                 For such programmable error-tolerant processors, the
                 Instruction-Set-Architecture (ISA) no longer serves as
                 a specification since it is acceptable for the
                 processor to allow for errors during the execution of
                 instructions. In this work, we address this
                 specification gap by defining the basic requirements
                 needed for an error-tolerant processor to provide
                 acceptable results. Furthermore, we formally define
                 properties that capture these requirements. Based on
                 this, we propose the Partially Protected Uniprocessor
                 (PPU), an error-tolerant processor that aims to meet
                 these requirements with low-cost microarchitectural
                 support. These protection mechanisms convert
                 potentially fatal control errors to potentially
                 tolerable data errors instead of ensuring
                 instruction-level or byte-level correctness. The
                 protection mechanisms in PPU protect the system against
                 crashes, unresponsiveness, and external device
                 corruption. In addition, they also provide support for
                 achieving acceptable result quality. Additionally, we
                 provide a methodology that formally proves the
                 specification properties on PPU using model checking.
                 This methodology uses models for the hardware and
                 software that are integrated with the fault and
                 recovery models. Finally, we experimentally demonstrate
                 the results of model checking and the application-level
                 quality of results for PPU.",
  acknowledgement = ack-nhfb,
  articleno =    "43",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Gorantla:2017:DAC,
  author =       "Anusha Gorantla and Deepa P.",
  title =        "Design of Approximate Compressors for Multiplication",
  journal =      j-JETC,
  volume =       "13",
  number =       "3",
  pages =        "44:1--44:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3007649",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jul 11 17:10:31 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Approximate computing is a promising technique for
                 energy-efficient Very Large Scale Integration (VLSI)
                 system design. It is best suited for error-resilient
                 applications such as signal processing and multimedia.
                 Approximate computing reduces accuracy but still
                 provides significant and faster results with lower
                 power consumption. This is attractive to arithmetic
                 circuits. In this article, various novel design
                 approaches of approximate 4-2 and 5-2 compressors have
                 been proposed for reduction of the partial product
                 stages in multiplication. Three approximate 8 $ \times
                 $ 8 Dadda multiplier designs using three novel
                 approximate 4-2 compressors and two approximate 8 $
                 \times $ 8 Dadda multiplier designs using two novel
                 approximate 5-2 compressors have proposed. The
                 synthesis results show that the proposed designs
                 achieved significant accuracy improvement together with
                 power and delay reductions compared to the existing
                 approximate designs.",
  acknowledgement = ack-nhfb,
  articleno =    "44",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Kumar:2017:THS,
  author =       "Arvind Kumar and Zhe Wan and Winfried W. Wilcke and
                 Subramanian S. Iyer",
  title =        "Toward Human-Scale Brain Computing Using {$3$D} Wafer
                 Scale Integration",
  journal =      j-JETC,
  volume =       "13",
  number =       "3",
  pages =        "45:1--45:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2976742",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jul 11 17:10:31 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "The Von Neumann architecture, defined by strict and
                 hierarchical separation of memory and processor, has
                 been a hallmark of conventional computer design since
                 the 1940s. It is becoming increasingly unsuitable for
                 cognitive applications, which require massive parallel
                 processing of highly interdependent data. Inspired by
                 the brain, we propose a significantly different
                 architecture characterized by a large number of highly
                 interconnected simple processors intertwined with very
                 large amounts of low-latency memory. We contend that
                 this memory-centric architecture can be realized using
                 3D wafer scale integration for which the technology is
                 nearing readiness, combined with current CMOS device
                 technologies. The natural fault tolerance and lower
                 power requirements of neuromorphic processing make 3D
                 wafer stacking particularly attractive. In order to
                 assess the performance of this architecture, we propose
                 a specific embodiment of a neuronal system using 3D
                 wafer scale integration; formulate a simple model of
                 brain connectivity including short- and long-range
                 connections; and estimate the memory, bandwidth,
                 latency, and power requirements of the system using the
                 connectivity model. We find that 3D wafer scale
                 integration, combined with technologies nearing
                 readiness, offers the potential for scaleup to a
                 primate-scale brain, while further scaleup to a
                 human-scale brain would require significant additional
                 innovations.",
  acknowledgement = ack-nhfb,
  articleno =    "45",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Alawad:2017:SCS,
  author =       "Mohammed Alawad and Mingjie Lin",
  title =        "Sketching Computation with Stochastic Processing
                 Engines",
  journal =      j-JETC,
  volume =       "13",
  number =       "3",
  pages =        "46:1--46:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3007652",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jul 11 17:10:31 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "This article explores how to leverage stochastic
                 principles to gracefully exploit partial computation
                 results, hence achieving quality-scalable embedded
                 computing. Our work is inspired by the concept of
                 incremental sketching frequently found in artistic
                 rendering, where the drawing procedure consists of a
                 series of steps, each gradually improving the quality
                 of results. The essence of our approach is to first
                 encode input signals as probability density functions
                 (PDFs), then perform stochastic computing operations on
                 all signals in the probabilistic domain, and finally
                 decode output signals by estimating the PDF of these
                 resulting random samples. Although numerous approximate
                 computing schemes exist, such as inaccurate adders and
                 multipliers that reduce bit width or weaken logic
                 circuit design, none of them can seamlessly improve
                 computing accuracy incrementally without making any
                 changes to the computing hardware at runtime.
                 Furthermore, in conventional embedded computing, a
                 sudden shortage of computing resources, such as
                 premature termination, often means a complete computing
                 failure and totally unusable results. Our sketching
                 computing scheme can readily trade off between the
                 quality of results and computing efforts without
                 modifying its circuit design. To validate our proposed
                 architecture design, we have implemented a
                 proof-of-concept computation sketching engine based on
                 a probabilistic convolver using a Virtex-6 FPGA device.
                 Using three widely deployed image processing
                 applications---image correspondence, image sharpening,
                 and edge detection---we have demonstrated that important
                 embedded computing applications can indeed be
                 ``sketched'' in a graceful manner using roughly one
                 third the hardware and one fifth the energy compared to
                 the traditional multiplier-based computing method.",
  acknowledgement = ack-nhfb,
  articleno =    "46",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Alaghi:2017:TAE,
  author =       "Armin Alaghi and Wei-Ting J. Chan and John P. Hayes
                 and Andrew B. Kahng and Jiajia Li",
  title =        "Trading Accuracy for Energy in Stochastic Circuit
                 Design",
  journal =      j-JETC,
  volume =       "13",
  number =       "3",
  pages =        "47:1--47:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2990503",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jul 11 17:10:31 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "As we approach the limits of traditional Moore's-Law
                 scaling, alternative computing techniques that consume
                 energy more efficiently become attractive. Stochastic
                 computing (SC), as a re-emerging computing technique,
                 is a low-cost and error-tolerant alternative to
                 conventional binary circuits in several important
                 applications such as image processing and
                 communications. SC allows a natural accuracy-energy
                 tradeoff that has been exploited in the past. This
                 article presents an accuracy-energy tradeoff technique
                 for SC circuits that reduces their energy consumption
                 with virtually no accuracy loss. To this end, we employ
                 voltage or frequency scaling, which normally reduce
                 energy consumption at the cost of timing errors. Then
                 we show that due to their inherent error tolerance, SC
                 circuits operate satisfactorily without significant
                 accuracy loss even with aggressive scaling. This
                 significantly improves their energy efficiency. In
                 contrast, conventional binary circuits quickly fail as
                 the supply voltage decreases. To find the most
                 energy-efficient operating point of an SC circuit, we
                 propose an error estimation method that allows us to
                 quickly explore the circuit's design space. The error
                 estimation method is based on Markov chain and
                 least-squares regression. Furthermore, we investigate
                 opportunities to optimize SC circuits under such
                 aggressive scaling. We find that logical and physical
                 design techniques can be combined to significantly
                 expand the already-powerful accuracy-energy tradeoff
                 possibilities of SC. In particular, we demonstrate that
                 careful adjustment of path delays can lead to
                 significant error reduction under voltage and frequency
                 scaling. We perform buffer insertion and route
                 detouring to achieve more balanced path delays. These
                 techniques differ from conventional path-balancing
                 techniques whose goal is to minimize power consumption
                 by resizing the non-critical paths. The goal of our
                 path-balancing approach is to increase error
                 cancellation chances in voltage-/frequency-scaled SC
                 circuits. Our circuit optimization comprehends the
                 tradeoff between power overheads due to inserted
                 buffers and wires versus the energy reduction from
                 supply voltage downscaling enabled by more balanced
                 path delays. Simulation results show that our optimized
                 SC circuits can tolerate aggressive voltage scaling
                 with no significant signal-to-noise ratio (SNR)
                 degradation. In one example, a 40\% supply voltage
                 reduction (1V to 0.6V) on the SC circuit leads to 66\%
                 energy saving (20.7pJ to 6.9pJ) and makes it more
                 efficient than its conventional binary counterpart. In
                 the same example, a 100\% frequency boosting (400ps to
                 200ps) of the optimized circuits leads to no
                 significant SNR degradation. We also show that process
                 variation and temperature variation have limited impact
                 on optimized SC circuits. The error change is less than
                 5\% when temperature changes by 100${}^\circ $C or
                 process condition changes from worst case to best
                 case.",
  acknowledgement = ack-nhfb,
  articleno =    "47",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Salehi:2017:SSM,
  author =       "Soheil Salehi and Deliang Fan and Ronald F. DeMara",
  title =        "Survey of {STT-MRAM} Cell Design Strategies: Taxonomy
                 and Sense Amplifier Tradeoffs for Resiliency",
  journal =      j-JETC,
  volume =       "13",
  number =       "3",
  pages =        "48:1--48:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2997650",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jul 11 17:10:31 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Spin-Transfer Torque Random Access Memory (STT-MRAM)
                 has been explored as a post-CMOS technology for
                 embedded and data storage applications seeking
                 non-volatility, near-zero standby energy, and high
                 density. Towards attaining these objectives for
                 practical implementations, various techniques to
                 mitigate the specific reliability challenges associated
                 with STT-MRAM elements are surveyed, classified, and
                 assessed in this article. Cost and suitability metrics
                 assessed include the area of nanomagnetic and CMOS
                 components per bit, access time and complexity, sense
                 margin, and energy or power consumption costs versus
                 resiliency benefits. Solutions to the reliability
                 issues identified are addressed within a taxonomy
                 created to categorize the current and future approaches
                 to reliable STT-MRAM designs. A variety of destructive
                 and non-destructive sensing schemes are assessed for
                 process variation tolerance, read disturbance
                 reduction, sense margin, and write polarization
                 asymmetry compensation. The highest resiliency
                 strategies deliver a sensing margin above 300mV while
                 incurring low power and energy consumption on the order
                 of picojoules and microwatts, respectively, and
                 attaining read sense latency of a few nanoseconds down
                 to hundreds of picoseconds for non-destructive and
                 destructive sensing schemes, respectively.",
  acknowledgement = ack-nhfb,
  articleno =    "48",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Yu:2017:RMA,
  author =       "Songping Yu and Nong Xiao and Mingzhu Deng and Fang
                 Liu and Wei Chen",
  title =        "Redesign the Memory Allocator for Non-Volatile Main
                 Memory",
  journal =      j-JETC,
  volume =       "13",
  number =       "3",
  pages =        "49:1--49:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2997651",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jul 11 17:10:31 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "The non-volatile memory (NVM) has the merits of
                 byte-addressability, fast speed, persistency and low
                 power consumption, which make it attractive to be used
                 as main memory. Commonly, user process dynamically
                 acquires memory through memory allocators. However,
                 traditional memory allocators designed with in-place
                 data writes are not appropriate for the non-volatile
                 main memory (NVRAM) due to the limited endurance. In
                 this article, first, we quantitatively analyze the
                 wear-oblivious of DRAM-oriented designed
                 allocator---glibc malloc and the inefficiency of
                 wear-conscious allocator NVMalloc. Then, we propose
                 WAlloc, an efficient wear-aware manual memory allocator
                 designed for NVRAM: (1) decouples metadata and data
                 management; (2) distinguishes metadata with volatility;
                 (3) redirects the data writes around to achieve
                 wear-leveling; (4) redesigns an efficient and effective
                 NVM copy mechanism, bypassing the CPU cache partially
                 and prefetching data explicitly. Finally, experimental
                 results show that the wear-leveling of WAlloc
                 outperforms that of NVMalloc about 30\% and 60\% under
                 random workloads and well-distributed workloads,
                 respectively. Besides, WAlloc reduces the average data
                 memory writes in 64 bytes block by 1.5 times comparing
                 with glibc malloc. With the fulfillment of data
                 persistency, cache bypassing NVM copy is better than
                 cache line flushing NVM copy with performance
                 improvement circa 14\%.",
  acknowledgement = ack-nhfb,
  articleno =    "49",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Li:2017:PUD,
  author =       "Bing Li and Yu Hu and Ying Wang and Jing Ye and
                 Xiaowei Li",
  title =        "Power-Utility-Driven Write Management for {MLC PCM}",
  journal =      j-JETC,
  volume =       "13",
  number =       "3",
  pages =        "50:1--50:??",
  month =        may,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2997648",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Tue Jul 11 17:10:31 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Phase change memory (PCM) is a promising alternative
                 to Dynamic Random Access Memory (DRAM) as main memory
                 due to its merits of high density and low leakage
                 power. Multi-level Cell (MLC) PCM is more attractive
                 than Single-level Cell (SLC) PCM, because it can store
                 multiple bits per cell to achieve higher density and
                 lower per-bit cost. With the iterative program-verify
                 write technique, MLC PCM writes demand at much higher
                 power than DRAM writes, while the power supply system
                 of MLC memory system is similar to that of DRAM, and
                 the power capability is limited. The incompatibility of
                 high write power and limited power budget results in
                 the degradation of the write throughput and performance
                 in MLC PCM. In this work, we investigate both write
                 scheduling policy and power management to improve the
                 MLC power utility and alleviate the negative impacts
                 induced by high write power. We identify the
                 power-utility-driven write scheduling as an online
                 bin-packing problem and then derive a
                 power-utility-driven scheduling (PUDS) policy from the
                 First Fit algorithm to improve the write power usage.
                 Based on the ramp-down characteristic of the SET pulse
                 (the pulse changes the PCM to high resistance), we
                 propose the SET Power Amortization (SPA) policy, which
                 proactively reclaims the power tokens at the intra-SET
                 level to promote the power utilization. Our
                 experimental results demonstrate that the PUDS and SPA
                 respectively achieve 24\% and 27\% performance
                 improvement over the state-of-the-art power management
                 technique, and the PUDS8SPA has an overall 31\%
                 improvement of the power utility and 50\% increase of
                 performance compared to the baseline system.",
  acknowledgement = ack-nhfb,
  articleno =    "50",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Ghosh:2017:AQC,
  author =       "Mrityunjay Ghosh and Amlan Chakrabarti and Niraj K.
                 Jha",
  title =        "Automated Quantum Circuit Synthesis and Cost
                 Estimation for the Binary Welded Tree Oracle",
  journal =      j-JETC,
  volume =       "13",
  number =       "4",
  pages =        "51:1--51:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3060582",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Sat Aug 12 09:05:32 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jetc/;
                 https://www.math.utah.edu/pub/tex/bib/jetc.bib",
  abstract =     "Quantum computing is a new computational paradigm that
                 promises an exponential speed-up over classical
                 algorithms. To develop efficient quantum algorithms for
                 problems of a non-deterministic nature, random walk is
                 one of the most successful concepts employed. In this
                 article, we target both continuous-time and
                 discrete-time random walk in both the classical and
                 quantum regimes. Binary Welded Tree (BWT), or glued
                 tree, is one of the most well-known quantum walk
                 algorithms in the continuous-time domain. Prior work
                 implements quantum walk on the BWT with static welding.
                 In this context, static welding is randomized but
                 case-specific. We propose a solution to automatically
                 generate the circuit for the Oracle for welding. We
                 implement the circuit using the Quantum Assembly
                 Language, which is a language for describing quantum
                 circuits. We then optimize the generated circuit using
                 the Fault-Tolerant Quantum Logic Synthesis tool for any
                 BWT instance. Automatic welding enables us to provide a
                 generalized solution for quantum walk on the BWT.",
  acknowledgement = ack-nhfb,
  articleno =    "51",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Govindaraj:2017:DAS,
author = "Rekha Govindaraj and Swaroop Ghosh",
title = "Design and Analysis of {STTRAM}-Based Ternary Content
Addressable Memory Cell",
journal = j-JETC,
volume = "13",
number = "4",
pages = "52:1--52:??",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3060578",
ISSN = "1550-4832",
bibdate = "Sat Aug 12 09:05:32 MDT 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/jetc/;
https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Content Addressable Memory (CAM) is widely used in
applications where searching a specific pattern of data
is a major operation. Conventional CAMs suffer from
area, power, and speed limitations. We propose
Spin-Torque Transfer RAM--based Ternary CAM (TCAM)
cells. The proposed NOR-type TCAM cell has a 62.5\%
(33\%) reduction in number of transistors compared to
conventional CMOS TCAMs (spintronic TCAMs). We analyzed
the sense margin of the proposed TCAM with respect to
16-, 32-, 64-, 128-, and 256-bit word sizes in 22nm
predictive technology. Simulations indicated a reliable
sense margin of 50mV even at 0.7V supply voltage for
256-bit word. We also explored a selective threshold
voltage modulation of transistors to improve the sense
margin and tolerate process and voltage variations. The
worst-case search latency and sense margin of 256-bit
TCAM is found to be 263ps and 220mV, respectively, at
1V supply voltage. The average search power consumed is
13mW, and the search energy is 4.7fJ/bit search. The
write time is 4ns, and the write energy is 0.69pJ/bit.
We leverage the NOR-type TCAM design to realize a 9T-2
Magnetic Tunnel Junctions NAND-type TCAM cell that has
43.75\% less number of transistors than the
conventional CMOS TCAM cell. A NAND-type cell can
support up to 64-bit words with a maximum sense margin
of up to 33mV. We compare the performance metrics of
NOR- and NAND-type TCAM cells with other TCAMs in the
literature.",
acknowledgement = ack-nhfb,
articleno = "52",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Peter:2017:OON,
author = "Eldhose Peter and Anuj Arora and Janibul Bashir and
Akriti Bagaria and Smruti R. Sarangi",
title = "Optical Overlay {NUCA}: a High-Speed Substrate for
Shared {L2} Caches",
journal = j-JETC,
volume = "13",
number = "4",
pages = "53:1--53:??",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3064833",
ISSN = "1550-4832",
bibdate = "Sat Aug 12 09:05:32 MDT 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/jetc/;
https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "In this article, we propose using optical
networks-on-chip (NoCs) to design cache access
protocols for large shared L2 caches. We observe that
the problem is unique because optical networks have
very low latency, and in principle all of the cache
banks are very close to each other. A naive approach is
to broadcast a request to a set of banks that might
possibly contain the copy of a block. However, this
approach is wasteful in terms of energy and bandwidth.
Hence, we propose a set of novel schemes that create a
set of virtual networks (overlays) of cache banks
over a physical optical NoC. We search for a block
inside each overlay using a combination of multicast
and unicast messages. We first propose two simple
protocols: TSI and Broadcast. The former uses unicast
messages, and the latter uses multicast messages. We
subsequently propose an improved scheme, OP\_BCAST,
that combines the best of TSI and Broadcast, and mainly
uses restricted multicast messages. Then we propose a
set of novel hardware structures for creating and
managing overlays, for efficiently locating blocks in
the overlay, and for implementing dynamically changing
overlays with OP\_BCAST. The performance of the TSI
scheme is within 2\% to 3\% of a broadcast scheme, and
it is faster than traditional schemes with electrical
networks by 26\%. Compared to the broadcast scheme, it
reduces the number of accesses, and consequently the
dynamic energy of the caches by 6\% to 8\%. OP\_BCAST
is 34\% faster than the best solutions with
copper-based NoCs; moreover, it reduces the dynamic
energy for cache access by 33\% compared to the TSI
scheme.",
acknowledgement = ack-nhfb,
articleno = "53",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Koneru:2017:IEC,
author = "Abhishek Koneru and Sukeshwar Kannan and Krishnendu
Chakrabarty",
title = "Impact of Electrostatic Coupling and Wafer-Bonding
Defects on Delay Testing of Monolithic {$3$D}
Integrated Circuits",
journal = j-JETC,
volume = "13",
number = "4",
pages = "54:1--54:??",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3041026",
ISSN = "1550-4832",
bibdate = "Sat Aug 12 09:05:32 MDT 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/jetc/;
https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Monolithic three-dimensional (M3D) integration is
gaining momentum, as it has the potential to achieve
significantly higher device density compared to 3D
integration based on through-silicon vias. M3D
integration uses several techniques that are not used
in the fabrication of conventional integrated circuits
(ICs). Therefore, a detailed analysis of the M3D
fabrication process is required to understand the
impact of defects that are likely to occur during chip
fabrication. In this article, we first analyze
electrostatic coupling in M3D ICs, which arises due to
the aggressive scaling of the interlayer dielectric
(ILD) thickness. We then analyze defects that arise due
to voids created during wafer bonding, a key step in
most M3D fabrication processes. We quantify the impact
of these defects on the threshold voltage of a
top-layer transistor in an M3D IC. We also show that
wafer-bonding defects can lead to a change in the
resistance of interlayer vias (ILVs), and in some cases
lead to an open in an ILV or a short between two ILVs.
We then analyze the impact of these defects on path
delays using HSpice simulations. We study their impact
on the effectiveness of delay-test patterns for
multiple instances of IWLS 2005 benchmarks in which
these defects were randomly injected. Our results show
that the timing characteristics of an M3D IC can be
significantly altered due to coupling and wafer-bonding
defects if the thickness of its ILD is less than 100nm.
Therefore, for such M3D ICs, test-generation methods
must be enhanced to take M3D fabrication defects into
account.",
acknowledgement = ack-nhfb,
articleno = "54",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Houshmand:2017:QCS,
author = "Mahboobeh Houshmand and Mehdi Sedighi and Morteza
Saheb Zamani and Kourosh Marjoei",
title = "Quantum Circuit Synthesis Targeting to Improve One-Way
Quantum Computation Pattern Cost Metrics",
journal = j-JETC,
volume = "13",
number = "4",
pages = "55:1--55:??",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3064834",
ISSN = "1550-4832",
bibdate = "Sat Aug 12 09:05:32 MDT 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/jetc/;
https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "One-way quantum computation (1WQC) is a model of
universal quantum computations in which a specific
highly entangled state called a cluster state allows
for quantum computation by single-qubit measurements.
The needed computations in this model are organized as
measurement patterns. The traditional approach to
obtain a measurement pattern is by translating a
quantum circuit that solely consists of CZ and $
J(\alpha) $ gates into the corresponding measurement
patterns and then performing some optimizations by
using techniques proposed for the 1WQC model. However,
in these cases, the input of the problem is a quantum
circuit, not an arbitrary unitary matrix. Therefore, in
this article, we focus on the first phase---that is,
decomposing a unitary matrix into CZ and $ J(\alpha) $
gates. Two well-known quantum circuit synthesis
methods, namely cosine-sine decomposition and quantum
Shannon decomposition are considered and then adapted
for a library of gates containing CZ and $ J(\alpha) $,
equipped with optimizations. By exploring the solution
space of the combinations of these two methods in a
bottom-up approach of dynamic programming, a
multiobjective quantum circuit synthesis method is
proposed that generates a set of quantum circuits. This
approach attempts to simultaneously improve the
measurement pattern cost metrics after the translation
from this set of quantum circuits.",
acknowledgement = ack-nhfb,
articleno = "55",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Yogendra:2017:CST,
author = "Karthik Yogendra and Chamika Liyanagedera and Deliang
Fan and Yong Shim and Kaushik Roy",
title = "Coupled Spin-Torque Nano-Oscillator-Based Computation:
a Simulation Study",
journal = j-JETC,
volume = "13",
number = "4",
pages = "56:1--56:??",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3064835",
ISSN = "1550-4832",
bibdate = "Sat Aug 12 09:05:32 MDT 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/jetc/;
https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "In this article, we present a comprehensive study of
four frequency locking mechanisms in Spin Torque Nano
Oscillators (STNOs) and explore their suitability for a
class of specialized computing applications. We
implemented a physical STNO model based on
Landau--Lifshitz--Gilbert--Slonczewski equation and
benchmarked the model to experimental data. Based on
our simulations, we provide an in-depth analysis of how
the ``self-organizing'' ability of coupled STNO array
can be effectively used for computations that are
unsuitable or inefficient in the von-Neumann computing
domain. As a case study, we demonstrate the computing
ability of coupled STNOs with two applications: edge
detection of an image and associative computing for
image recognition. We provide an analysis of the
scaling trends of STNOs and the effectiveness of
different frequency locking mechanisms with scaling in
the presence of thermal noise. We also provide an
in-depth analysis of the effect of variations on the
four locking mechanisms to find the most robust one in
the presence of variations.",
acknowledgement = ack-nhfb,
articleno = "56",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Najafi:2017:RAS,
author = "M. Hassan Najafi and Peng Li and David J. Lilja and
Weikang Qian and Kia Bazargan and Marc Riedel",
title = "A Reconfigurable Architecture with Sequential
Logic-Based Stochastic Computing",
journal = j-JETC,
volume = "13",
number = "4",
pages = "57:1--57:??",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3060537",
ISSN = "1550-4832",
bibdate = "Sat Aug 12 09:05:32 MDT 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/jetc/;
https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Computations based on stochastic bit streams have
several advantages compared to deterministic binary
radix computations, including low power consumption,
low hardware cost, high fault tolerance, and skew
tolerance. To take advantage of this computing
technique, previous work proposed a combinational
logic-based reconfigurable architecture to perform
complex arithmetic operations on stochastic streams of
bits. The long execution time and the cost of
converting between binary and stochastic
representations, however, make the stochastic
architectures less energy efficient than the
deterministic binary implementations. This article
introduces a methodology for synthesizing a given
target function stochastically using finite-state
machines (FSMs), and enhances and extends the
reconfigurable architecture using sequential logic.
Compared to the previous approach, the proposed
reconfigurable architecture can save hardware area and
energy consumption by up to 30\% and 40\%,
respectively, while achieving a higher processing
speed. Both stochastic reconfigurable architectures are
much more tolerant of soft errors (bit flips) than the
deterministic binary radix implementations, and their
fault tolerance scales gracefully to very large numbers
of errors.",
acknowledgement = ack-nhfb,
articleno = "57",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Chittamuru:2017:SRS,
author = "Sai Vineel Reddy Chittamuru and Srinivas Desai and
Sudeep Pasricha",
title = "{SWIFTNoC}: a Reconfigurable Silicon-Photonic Network
with Multicast-Enabled Channel Sharing for Multicore
Architectures",
journal = j-JETC,
volume = "13",
number = "4",
pages = "58:1--58:??",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3060517",
ISSN = "1550-4832",
bibdate = "Sat Aug 12 09:05:32 MDT 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/jetc/;
https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "On-chip communication is widely considered to be one
of the major performance bottlenecks in contemporary
chip multiprocessors (CMPs). With recent advances in
silicon nanophotonics, photonics-based network-on-chip
(NoC) architectures are being considered as a viable
solution to support communication in future CMPs as
they can enable higher bandwidth and lower power
dissipation compared to traditional electrical NoCs. In
this article, we present SwiftNoC, a novel
reconfigurable silicon-photonic NoC architecture that
features improved multicast-enabled channel sharing, as
well as dynamic re-prioritization and exchange of
bandwidth between clusters of cores running multiple
applications, to increase channel utilization and
system performance. Experimental results show that
SwiftNoC improves throughput by up to $ 25.4 \times $
while reducing latency by up to 72.4\% and
energy-per-bit by up to 95\% over state-of-the-art
solutions.",
acknowledgement = ack-nhfb,
articleno = "58",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Samal:2017:IPU,
author = "Sandeep Kumar Samal and Guoqing Chen and Sung Kyu
Lim",
title = "Improving Performance under Process and Voltage
Variations in Near-Threshold Computing Using {$3$D}
{ICs}",
journal = j-JETC,
volume = "13",
number = "4",
pages = "59:1--59:??",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3060579",
ISSN = "1550-4832",
bibdate = "Sat Aug 12 09:05:32 MDT 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/jetc/;
https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Near-threshold computing (NTC) circuits have been
shown to offer significant energy efficiency and power
benefits but with a huge performance penalty. This
performance loss exacerbates if process and voltage
variations are considered. In this article, we
demonstrate that three-dimensional (3D) IC technology
can overcome this limitation. We present a detailed
case study with a 28nm commercial-grade core at 0.6V
operation optimized with various 3D IC physical design
methods. First, our study under the deterministic case
shows that 3D IC NTC design outperforms 2D IC NTC by
29.5\% in terms of performance at comparable energy.
This is significantly higher than the 12.8\%
performance benefit of 3D IC at nominal voltage
supplies due to higher delay sensitivity to input slew
at lower voltages. Second, it is well demonstrated that
transistor delay is more sensitive to voltage changes
at NTC operation. However, our full-chip study reveals
that IR drop effect on 2D/3D IC NTC performance is not
severe due to the low power consumption and hence lower
IR drop values. Third, die-to-die variation impact on
full-chip performance is visible in 3D IC NTC designs,
but it is not worse compared to 2D IC NTC designs. This
is mainly due to the shorter critical path length in 3D
IC NTC designs.",
acknowledgement = ack-nhfb,
articleno = "59",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Jiang:2017:RCC,
author = "Honglan Jiang and Cong Liu and Leibo Liu and Fabrizio
Lombardi and Jie Han",
title = "A Review, Classification, and Comparative Evaluation
of Approximate Arithmetic Circuits",
journal = j-JETC,
volume = "13",
number = "4",
pages = "60:1--60:??",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3094124",
ISSN = "1550-4832",
bibdate = "Sat Aug 12 09:05:32 MDT 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/jetc/;
https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Often as the most important arithmetic modules in a
processor, adders, multipliers, and dividers determine
the performance and energy efficiency of many computing
tasks. The demand of higher speed and power efficiency,
as well as the feature of error resilience in many
applications (e.g., multimedia, recognition, and data
analytics), have driven the development of approximate
arithmetic design. In this article, a review and
classification are presented for the current designs of
approximate arithmetic circuits including adders,
multipliers, and dividers. A comprehensive and
comparative evaluation of their error and circuit
characteristics is performed for understanding the
features of various designs. By using approximate
multipliers and adders, the circuit for an image
processing application consumes as little as 47\% of
the power and 36\% of the power-delay product of an
accurate design while achieving similar image
processing quality. Improvements in delay, power, and
area are obtained for the detection of differences in
images by using approximate dividers.",
acknowledgement = ack-nhfb,
articleno = "60",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Li:2017:EEC,
author = "Hui Li and S{\'e}bastien {Le Beux} and Martha Johanna
Sepulveda and Ian O'Connor",
title = "Energy-Efficiency Comparison of Multi-Layer Deposited
Nanophotonic Crossbar Interconnects",
journal = j-JETC,
volume = "13",
number = "4",
pages = "61:1--61:??",
month = aug,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3094125",
ISSN = "1550-4832",
bibdate = "Sat Aug 12 09:05:32 MDT 2017",
bibsource = "http://www.acm.org/pubs/contents/journals/jetc/;
https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Single-layer optical crossbar interconnections based
on Wavelength Division Multiplexing stand among other
nanophotonic interconnects because of their low latency
and low power. However, such architectures suffer from
a poor scalability due to losses induced by long
propagation distances on waveguides and waveguide
crossings. Multi-layer deposited silicon technology
allows the stacking of optical layers that are
connected by means of Optical Vertical Couplers. This
allows significant reduction in the optical losses,
which contributes to improve the interconnect
scalability but also leads to new challenges related to
network designs and layouts. In this article, we
investigate the design of optical crossbars using
multi-layer silicon deposited technology. We propose
implementations for Ring-, Matrix-, $ \lambda
$-router-, and Snake-based topologies. Layouts avoiding
waveguide crossings are compared to those minimizing
the waveguide length according to worst-case and
average losses. The laser output power is estimated
from the losses, which allows us to evaluate the energy
efficiency improvement induced by multi-layer
technology over traditional planar implementations
(33\% on average). Finally, networks comparison has
been carried out and the results show that the ring
topology leads to a 43\% reduction in the laser output
power.",
acknowledgement = ack-nhfb,
articleno = "61",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Gala:2018:ATN,
author = "Neel Gala and Sarada Krithivasan and Wei-Yu Tsai and
Xueqing Li and Vijaykrishnan Narayanan and V.
Kamakoti",
title = "An Accuracy Tunable Non-{Boolean} Co-Processor Using
Coupled Nano-Oscillators",
journal = j-JETC,
volume = "14",
number = "1",
pages = "1:1--1:??",
month = mar,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3094263",
ISSN = "1550-4832",
bibdate = "Thu Nov 1 16:44:40 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "As we enter an era witnessing the closer end of
Dennard scaling, where further reduction in power
supply-voltage to reduce power consumption becomes more
challenging in conventional systems, a goal of
developing a system capable of performing large
computations with minimal area and power overheads
needs more optimization aspects. A rigorous exploration
of alternate computing techniques, which can mitigate
the limitations of Complementary Metal-Oxide
Semiconductor (CMOS) technology scaling and
conventional Boolean systems, is imperative. Reflecting
on these lines of thought, in this article we explore
the potential of non-Boolean computing employing
nano-oscillators for performing varied functions. We
use a two coupled nano-oscillator as our basic
computational model and propose an architecture for a
non-Boolean coupled oscillator based co-processor
capable of executing certain functions that are
commonly used across a variety of approximate
application domains. The proposed architecture includes
an accuracy tunable knob, which can be tuned by the
programmer at runtime. The functionality of the
proposed co-processor is verified using a soft coupled
oscillator model based on Kuramoto oscillators. The
article also demonstrates how real-world applications
such as Vector Quantization, Digit Recognition,
Structural Health Monitoring, and the like, can be
deployed on the proposed model. The proposed
co-processor architecture is generic in nature and can
be implemented using any of the existing modern day
nano-oscillator technologies such as Resonant Body
Transistors (RBTs), Spin-Torque Nano-Oscillators
(STNOs), and Metal-Insulator Transition (MITs). In
this article, we perform a validation of the proposed
architecture using the HyperField Effect Transistor
(FET) technology-based coupled oscillators, which
provide improvements of up to $ 3.5 \times $ increase
in clock speed and up to $ 10.75 \times $ and $ 14.12
\times $ reduction in area and power consumption,
respectively, as compared to a conventional Boolean
CMOS accelerator executing the same functions.",
acknowledgement = ack-nhfb,
articleno = "1",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Uddin:2018:DCM,
author = "Mesbah Uddin and MD. Badruddoja Majumder and Karsten
Beckmann and Harika Manem and Zahiruddin Alamgir and
Nathaniel C. Cady and Garrett S. Rose",
title = "Design Considerations for Memristive Crossbar Physical
Unclonable Functions",
journal = j-JETC,
volume = "14",
number = "1",
pages = "2:1--2:??",
month = mar,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3094414",
ISSN = "1550-4832",
bibdate = "Thu Nov 1 16:44:40 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Hardware security has emerged as a field concerned
with issues such as integrated circuit (IC)
counterfeiting, cloning, piracy, and reverse
engineering. Physical unclonable functions (PUF) are
hardware security primitives useful for mitigating such
issues by providing hardware-specific fingerprints
based on intrinsic process variations within individual
IC implementations. As technology scaling progresses
further into the nanometer region, emerging
nanoelectronic technologies, such as memristors or
RRAMs (resistive random-access memory), have become
interesting options for emerging computing systems. In
this article, using a comprehensive temperature
dependent model of an HfO$_x$ (hafnium-oxide)
memristor, based on experimental measurements, we
explore the best region of operation for a memristive
crossbar PUF (XbarPUF). The design considered also
employs XORing and a column shuffling technique to
improve reliability and resilience to machine learning
attacks. We present a detailed analysis for the noise
margin and discuss the scalability of the XbarPUF
structure. Finally, we present results for estimates of
area, power, and delay alongside security performance
metrics to analyze the strengths and weaknesses of the
XbarPUF. Our XbarPUF exhibits nearly ideal (near 50\%)
uniqueness, bit-aliasing and uniformity, good
reliability of 90\% and up (with 100\% being ideal), a
very small footprint, and low average power consumption
$ \approx 104 \mu $W.",
acknowledgement = ack-nhfb,
articleno = "2",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Yu:2018:SOF,
author = "Ye Yu and Niraj K. Jha",
title = "Statistical Optimization of {FinFET} Processor
Architectures under {PVT} Variations Using Dual
Device-Type Assignment",
journal = j-JETC,
volume = "14",
number = "1",
pages = "3:1--3:??",
month = mar,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3110714",
ISSN = "1550-4832",
bibdate = "Thu Nov 1 16:44:40 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "With semiconductor technology scaling to the 22nm node
and beyond, fin field-effect transistor (FinFET) has
started replacing complementary metal-oxide
semiconductor (CMOS), thanks to its superior control of
short-channel effects and much lower leakage current.
However, process, supply voltage, and temperature (PVT)
variations across the integrated circuit (IC) become
worse with technology scaling. Thus, to analyze timing,
leakage power, and dynamic power under PVT variations,
statistical analysis/optimization techniques are more
suitable than traditional static timing/power analysis
and optimization counterparts. In this article, we
propose a statistical optimization framework using dual
device-type assignment at the architecture level under
PVT variations that takes spatial correlations into
account and leverages circuit-level statistical
analysis techniques. To the best of our knowledge, this
is the first work to study statistical optimization at
the system level under PVT variations. Simulation
results show that leakage power yield and dynamic power
yield at the mean value of the baseline can be improved
by up to 44.2\% and 21.2\%, respectively, with no loss
in timing yield for a single-core processor and up to
43.0\% and 50.0\%, respectively, without any loss in
timing yield for an 8-core chip multiprocessor (CMP),
at little area overhead. Under the same (99.0\%) power
yield constraints, leakage power and dynamic power are
reduced by up to 91.2\% and 4.3\%, respectively, for a
single-core processor, and up to 44.6\% and 12.5\%,
respectively, for an 8-core CMP, with no loss in timing
yield. We also show that optimizations performed
without taking module-to-module and core-to-core
spatial correlations into account overestimate yield,
establishing the importance of taking such correlations
into account.",
acknowledgement = ack-nhfb,
articleno = "3",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Hajkazemi:2018:HHM,
author = "Mohammad Hossein Hajkazemi and Mohammad Khavari Tavana
and Tinoosh Mohsenin and Houman Homayoun",
title = "Heterogeneous {HMC + DDRx} Memory Management for
Performance-Temperature Tradeoffs",
journal = j-JETC,
volume = "14",
number = "1",
pages = "4:1--4:??",
month = mar,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3106233",
ISSN = "1550-4832",
bibdate = "Thu Nov 1 16:44:40 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Three-dimensional DRAMs (3D-DRAMs) are emerging as a
promising solution to address the memory wall problem
in computer systems. However, high fabrication cost per
bit and thermal issues are the main reasons that
prevent architects from using 3D-DRAM alone as the main
memory building block. In this article, we address this
issue by proposing a heterogeneous memory system that
combines a double data rate (DDRx) DRAM with an
emerging 3D hybrid memory cube (HMC) technology.
Bandwidth and temperature management are the
challenging issues for this heterogeneous memory
architecture. To address these challenges, first we
introduce a memory page allocation policy for the
heterogeneous memory system to maximize performance.
Then, using the proposed policy, we introduce a
temperature-aware algorithm that dynamically
distributes the requested bandwidth between HMC and
DDRx DRAM to reduce the thermal hotspot while
maintaining high performance. We take into account the
impact of both core count and HMC channel count on
performance while using the proposed policies. The
results show that the proposed memory page allocation
policy can utilize the memory bandwidth close to 99\%
of the ideal bandwidth utilization. Moreover, our
temperature-aware bandwidth adaptation reduces the
average steady-state temperature of the HMC hotspot
across various workloads by 4.5 K while incurring 2.5\%
performance overhead.",
acknowledgement = ack-nhfb,
articleno = "4",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Bhattacharjee:2018:RFT,
author = "Sukanta Bhattacharjee and Debasis Mitra and Bhargab B.
Bhattacharya",
title = "Robust In-Field Testing of Digital Microfluidic
Biochips",
journal = j-JETC,
volume = "14",
number = "1",
pages = "5:1--5:??",
month = mar,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3123586",
ISSN = "1550-4832",
bibdate = "Thu Nov 1 16:44:40 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Microfluidic technology offers vast promise for
implementing biochemistry-on-chip with diverse
applications to clinical diagnosis, genome analysis,
drug design, and point-of-care testing. Among various
types of fluid-chips, droplet-based digital
microfluidic biochips (DMFBs), which consist of a
patterned array of controllable electrodes, provide the
advantage of programmability, ease of fluidic
operations, and versatile droplet mobility. However,
because of manufacturing or field defects, electrode
degradation, or dielectric breakdown, these chips may
suffer from incorrect fluidic behavior. Reliability of
fluidic operations is of utmost concern in DMFBs that
are used to perform safety-critical bio-protocols.
Various methods are deployed to test these devices,
either offline or being overlapped with bioassay
operations (termed as concurrent or in-field testing).
The main challenge of in-field testing lies in the fact
that the test must run concurrently with the execution
of the normal assay without hampering the correctness
of the latter. In prior work, optimal testing for
droplet mobility over all electrodes was formulated in
terms of finding either a Hamiltonian path or a
Eulerian path in an undirected graph that represents
the electrode-adjacency structure. Although these
models have been studied for offline testing, no such
effort was made in the area of concurrent testing. In
this work, we propose, for in-field application, an
SAT-based modeling and solution approach to find an
optimal test plan that can be used to check droplet
movement across the boundary between every pair of
adjacent electrodes, which is visited by the droplets
of the ongoing assay. The proposed method is robust and
determines a test solution successfully regardless of
the cover assay that is being executed concurrently.
Experiments on several real-life assays and other test
cases demonstrate the effectiveness of the method with
respect to test completion time.",
acknowledgement = ack-nhfb,
articleno = "5",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Yang:2018:IAC,
author = "Xiaokun Yang and Wujie Wen and Ming Fan",
title = "Improving {AES} Core Performance via an Advanced
{ASBUS} Protocol",
journal = j-JETC,
volume = "14",
number = "1",
pages = "6:1--6:??",
month = mar,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3110713",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Nov 1 16:44:40 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Security is becoming a de-facto requirement of
System-on-Chips (SoC), leading up to a significant
share of circuit design cost. In this article, we
propose an advanced SBUS protocol (ASBUS), to improve
the data feeding efficiency of the Advanced Encryption
Standard (AES) encrypted circuits. As a case study, the
direct memory access (DMA) combined with AES engine and
memory controller are implemented as our
design-under-test (DUT) using field-programmable gate
arrays (FPGA). The results show that our presented
ASBUS structure outperforms the AXI-based design for
cipher tests. As an example, the 32-bit ASBUS design
costs less in terms of hardware resources and achieves
higher throughput ($ 1.30 \times $) than the 32-bit AXI
implementation, and the dynamic energy consumed by the
ASBUS cipher test is reduced to 71.27\% compared with
the AXI test.",
acknowledgement = ack-nhfb,
articleno = "6",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Oneal:2018:RCS,
author = "Kenneth O'Neal and Daniel Grissom and Philip Brisk",
title = "Resource-Constrained Scheduling for Digital
Microfluidic Biochips",
journal = j-JETC,
volume = "14",
number = "1",
pages = "7:1--7:??",
month = mar,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3093930",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Nov 1 16:44:40 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Digital microfluidics based on
electrowetting-on-dielectric technology is poised to
revolutionize many aspects of chemistry and
biochemistry through miniaturization, automation, and
software programmability. Digital microfluidic biochips
(DMFBs) offer ample spatial parallelism, which is then
exposed to the compiler. The first problem that a DMFB
compiler must solve is resource-constrained scheduling,
which is NP-complete. If the compiler is applied
off-line, then long-running algorithms that produce
solutions of high quality, such as iterative
improvement or branch-and-bound search, can be applied;
in an online context, where a biochemical reaction is
to be executed as soon as it is specified by the
programmer, heuristics that sacrifice solution quality
to attain a fast runtime are used. This article
describes in detail the algorithms and heuristics that
have been proposed for resource-constrained scheduling,
focusing on several recent contributions: path
scheduling and force-directed list scheduling. It also
discusses shortcomings and limitations of existing
optimal scheduling problem formulations based on
Integer Linear Programming and presents an updated
formulation that addresses these issues. The algorithms
are compared and evaluated on an extensive benchmark
suite of biochemical assays used for applications, such
as in vitro diagnostics, protein crystallization, and
automated sample preparation.",
acknowledgement = ack-nhfb,
articleno = "7",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Motaman:2018:IPV,
author = "Seyedhamidreza Motaman and Swaroop Ghosh and Jaydeep
Kulkarni",
title = "Impact of Process Variation on Self-Reference Sensing
Scheme and Adaptive Current Modulation for Robust
{STTRAM} Sensing",
journal = j-JETC,
volume = "14",
number = "1",
pages = "8:1--8:??",
month = mar,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3132577",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Nov 1 16:44:40 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Spin-Transfer-Torque RAM (STTRAM) is a promising
technology for high-density on-chip cache due to low
standby power and high speed. However, the process
variation of the Magnetic Tunnel Junction (MTJ) and
access transistor poses a serious challenge to sensing.
Nondestructive sensing suffers from reference
resistance variation, whereas destructive sensing
suffers from failures due to unoptimized selection of
data and reference currents. Furthermore, the sense
speed is tightly coupled with the reference/data
current requirement. In this work, we study the process
variation effect on a self-reference sensing scheme to
eliminate bit-to-bit process variation in MTJ
resistance. Read current modulation is proposed to
overcome the failures due to process variation.
Simulation results reveal $ < 0.01 \% $ failures at the
cost of 9ns sense time and 190$ \mu $W power
consumption.",
acknowledgement = ack-nhfb,
articleno = "8",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Catania:2018:IEE,
author = "Vincenzo Catania and Andrea Mineo and Salvatore
Monteleone and Maurizio Palesi and Davide Patti",
title = "Improving Energy Efficiency in Wireless
Network-on-Chip Architectures",
journal = j-JETC,
volume = "14",
number = "1",
pages = "9:1--9:??",
month = mar,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3138807",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Nov 1 16:44:40 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Wireless Network-on-Chip (WiNoC) represents a
promising emerging communication technology for
addressing the scalability limitations of future
manycore architectures. In a WiNoC, high-latency and
power-hungry long-range multi-hop communications can be
realized by performance- and energy-efficient
single-hop wireless communications. However, the energy
contribution of such wireless communication accounts
for a significant fraction of the overall communication
energy budget. This article presents a novel energy
managing technique for WiNoC architectures aimed at
improving the energy efficiency of the main elements of
the wireless infrastructure, namely, radio-hubs. The
rationale behind the proposed technique is based on
selectively turning off, for the appropriate number of
cycles, all the radio-hubs that are not involved in the
current wireless communication. The proposed energy
managing technique is assessed on several network
configurations under different traffic scenarios both
synthetic and extracted from the execution of real
applications. The obtained results show that the
application of the proposed technique allows up to 25\%
total communication energy saving without any impact on
performance and with a negligible impact on the silicon
area of the radio-hub.",
acknowledgement = ack-nhfb,
articleno = "9",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Li:2018:ELC,
author = "Bohua Li and Yukui Pei and Wujie Wen",
title = "Efficient {LDPC} Code Design for Combating Asymmetric
Errors in {STT-RAM}",
journal = j-JETC,
volume = "14",
number = "1",
pages = "10:1--10:??",
month = mar,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3154836",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Nov 1 16:44:40 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Spin-transfer torque random access memory (STT-RAM) is
a promising emerging memory technology in the future
memory hierarchy. However, its unique reliability
challenges, i.e., the asymmetric bit failure mechanism
at different bit flippings, have raised significant
concerns in its real applications. Recent studies even
show that the common memory error repair ``remedies''
cannot efficiently address them. In this article, we
for the first time systematically study the potentials
of the strong low-density parity-check (LDPC) code for
combating such unique asymmetric errors in both
single-level-cell (SLC) and multi-level-cell (MLC)
STT-RAM designs. A generic STT-RAM channel model
suitable for the SLC/MLC designs, is developed to
analytically calibrate all the accumulated asymmetric
factors of the write/read operations. The key initial
information for LDPC decoding, namely asymmetric
log-likelihood ratio (A-LLR), is redesigned and
extracted from the proposed channel model, to unleash
the LDPC's asymmetric error correcting capability. LDPC
codec is also carefully designed to lower the hardware
cost by leveraging the systematic-structured parity
check matrix. Then two customized short-length LDPC
codes---(585,512) and (683,512)---augmented from the
semi-random parity check matrix and the A-LLR based
asymmetric decoding, are proposed for SLC and MLC
STT-RAM designs, respectively. Experiments show that
our proposed LDPC designs can improve the STT-RAM
reliability by at least $ 10^2 $ ($ 10^4 $) when compared
to the existing error correction codes (ECCs) for the
SLC (MLC) design, demonstrating the feasibility of LDPC
solutions on STT-RAM.",
acknowledgement = ack-nhfb,
articleno = "10",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Liu:2018:OAE,
author = "Yu Liu and Yingyezhe Jin and Peng Li",
title = "Online Adaptation and Energy Minimization for Hardware
Recurrent Spiking Neural Networks",
journal = j-JETC,
volume = "14",
number = "1",
pages = "11:1--11:??",
month = mar,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3145479",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Nov 1 16:44:40 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "The Liquid State Machine (LSM) is a promising model of
recurrent spiking neural networks that provides an
appealing brain-inspired computing paradigm for
machine-learning applications such as pattern
recognition. Moreover, processing information directly
on spiking events makes the LSM well suited for cost-
and energy-efficient hardware implementation. In this
article, we systematically present three techniques for
optimizing energy efficiency while maintaining good
performance of the proposed LSM neural processors from
both an algorithmic and hardware implementation point
of view. First, to realize adaptive LSM neural
processors, thus boost learning performance, we propose
a hardware-friendly Spike-Timing Dependent Plastic
(STDP) mechanism for on-chip tuning. Then, the LSM
processor incorporates a novel runtime
correlation-based neuron gating scheme to minimize the
power dissipated by reservoir neurons. Furthermore, an
activity-dependent clock gating approach is presented
to address the energy inefficiency due to the
memory-intensive nature of the proposed neural
processors. Using two different real-world tasks of
speech and image recognition to benchmark, we
demonstrate that the proposed architecture boosts the
average learning performance by up to 2.0\% while
reducing energy dissipation by up to 29\% compared to a
baseline LSM with little extra hardware overhead on a
Xilinx Virtex-6 FPGA.",
acknowledgement = ack-nhfb,
articleno = "11",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Grani:2018:SPS,
author = "Paolo Grani and Sandro Bartolini",
title = "Scalable Path-Setup Scheme for All-Optical Dynamic
Circuit Switched {NoCs} in Cache Coherent {CMPs}",
journal = j-JETC,
volume = "14",
number = "1",
pages = "12:1--12:??",
month = mar,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3154840",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Nov 1 16:44:40 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Nanophotonics is a promising solution for on-chip
interconnection due to its intrinsic low-latency and
low-power features, which can be useful for performance
and energy in future Chip Multi-Processors (CMPs). This
article proposes a novel arbitrated all-optical
path-setup scheme for tiled CMPs adopting
circuit-switched optical networks. It aims at
significantly reducing path-setup latency and overall
energy consumption. The proposed arbitrated scheme is
able to configure multiple photonic switches
simultaneously, instead of sequentially as it is done
in state-of-the-art proposals. The proposed fast
optical path-setup solution reduces the overhead in
each transmission and, most importantly, allows optical
circuit-switched networks to effectively serve cache
coherence traffic, which is mainly composed of
relatively small messages. Specifically, we propose a
single-arbiter scheme where the whole topology is
managed by a central module (single-arbiter) that takes
care of the path-setup procedures. Then, to tackle
scalability, we propose a logically clustered
architecture (multi-arbiter) in which an arbiter is
allocated in each logical core-cluster and an ad hoc
distributed reservation protocol coordinates arbiters
to manage inter-cluster path reservations. We show that
our proposed single-arbiter architecture outperforms a
state-of-the-art optical network with sequential
path-setup (optical baseline) in the case of 8- and
16-core tiled CMP setups. However, due to serialization
issues, the single-arbiter solution is not able to
compete with a reference electronic baseline for bigger
32- and 64-core setups even if still performing much
better than the optical baseline. Conversely, our
multi-arbiter hierarchical solution allows us to
improve performance up to almost 20\% and 40\% for 32-
and 64-core setups, respectively, demonstrating a wide
applicability of the proposed technique. Energy-wise,
the analyzed solutions enable significant savings
compared to both the optical baseline with sequential
path setup, and to the electronic counterpart.
Specifically, results show more than 25\% average
improvement for the single-arbiter in the 8- and
16-core cases, and more than 40\% and 15\% savings for
the multi-arbiter in the 32- and 64-core cases,
respectively.",
acknowledgement = ack-nhfb,
articleno = "12",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{VanRynbach:2018:QCP,
author = "Andre {Van Rynbach} and Muhammad Ahsan and Jungsang
Kim",
title = "A Quantum Computing Performance Simulator Based on
Circuit Failure Probability and Fault Path Counting",
journal = j-JETC,
volume = "14",
number = "1",
pages = "13:1--13:??",
month = mar,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3154837",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Nov 1 16:44:40 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Quantum computing performance simulators are needed to
provide practical metrics for the effectiveness of
executing theoretical quantum information processing
protocols on physical hardware. In this work, we
present a tool to simulate the execution of
fault-tolerant quantum computation by automating the
tracking of common fault paths for error propagation
through an encoded circuit block and quantifying the
failure probability of each encoded qubit throughout
the circuit. Our simulator runs a fault path counter on
encoded circuit blocks to determine the probability
that two or more errors remain on the encoded qubits
after each block is executed, and it combines errors
from all the encoded blocks to estimate performance
metrics such as the logical qubit failure probability,
the overall circuit failure probability, the number of
qubits used, and the time required to run the overall
circuit. Our technique efficiently estimates the upper
bound of the error probability and provides a useful
measure of the error threshold at low error
probabilities where conventional Monte Carlo methods
are ineffective. We describe a way of simplifying the
fault-tolerant measurement process in the Steane code
to reduce the number of error correction steps
necessary. We present simulation results comparing the
execution of quantum adders, which constitute a major
part of Shor's algorithm.",
acknowledgement = ack-nhfb,
articleno = "13",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Cao:2018:GEI,
author = "Yu Cao and Xin Li and Jae-Sun Seo and Ganesh Dasika",
title = "{Guest Editors}' Introduction: Frontiers of Hardware
and Algorithms for On-chip Learning",
journal = j-JETC,
volume = "14",
number = "2",
pages = "14:1--14:??",
month = jul,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3205944",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Nov 1 16:44:40 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
acknowledgement = ack-nhfb,
articleno = "14",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Kim:2018:DNN,
author = "Hyungjun Kim and Taesu Kim and Jinseok Kim and
Jae-Joon Kim",
title = "Deep Neural Network Optimized to Resistive Memory with
Nonlinear Current-Voltage Characteristics",
journal = j-JETC,
volume = "14",
number = "2",
pages = "15:1--15:??",
month = jul,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3145478",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Nov 1 16:44:40 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Artificial Neural Network computation relies on
intensive vector-matrix multiplications. Recently, the
emerging nonvolatile memory (NVM) crossbar array showed
a feasibility of implementing such operations with high
energy efficiency. Thus, there have been many works on
efficiently utilizing emerging NVM crossbar arrays as
analog vector-matrix multipliers. However, nonlinear
I-V characteristics of NVM restrain critical design
parameters, such as the read voltage and weight range,
resulting in substantial accuracy loss. In this
article, instead of optimizing hardware parameters to a
given neural network, we propose a methodology of
reconstructing the neural network itself to be
optimized to resistive memory crossbar arrays. To
verify the validity of the proposed method, we
simulated various neural networks with MNIST and
CIFAR-10 dataset using two different Resistive Random
Access Memory models. Simulation results show that our
proposed neural network produces inference accuracies
significantly higher than conventional neural network
when the network is mapped to synapse devices with
nonlinear I-V characteristics.",
acknowledgement = ack-nhfb,
articleno = "15",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Sarwar:2018:EEN,
author = "Syed Shakib Sarwar and Swagath Venkataramani and
Aayush Ankit and Anand Raghunathan and Kaushik Roy",
title = "Energy-Efficient Neural Computing with Approximate
Multipliers",
journal = j-JETC,
volume = "14",
number = "2",
pages = "16:1--16:??",
month = jul,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3097264",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Nov 1 16:44:40 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Neural networks, with their remarkable ability to
derive meaning from a large volume of complicated or
imprecise data, can be used to extract patterns and
detect trends that are too complex for the von Neumann
computing paradigm. Their considerable computational
requirements stretch the capabilities of even modern
computing platforms. We propose an approximate
multiplier that exploits the inherent application
resilience to error and utilizes the notion of
computation sharing to achieve improved energy
consumption for neural networks. We also propose a
Multiplier-less Artificial Neuron (MAN), which is even
more compact and energy efficient. We also propose a
network retraining methodology to recover some of the
accuracy loss due to the use of these approximate
multipliers. We evaluated the proposed algorithm/design
on several recognition applications. The results show
that we achieve $ \approx $33\%, $ \approx $32\%, and $
\approx $25\% reduction in power consumption and $
\approx $33\%, $ \approx $34\%, and $ \approx $27\%
reduction in area, respectively, for 12-, 8-, and 4-bit
MAN, with a maximum $ \approx $2.4\% loss in accuracy
compared to a conventional neuron implementation of
equivalent bit precision. These comparisons were
performed under iso-speed conditions.",
acknowledgement = ack-nhfb,
articleno = "16",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Ko:2018:RTL,
author = "Glenn G. Ko and Rob A. Rutenbar",
title = "Real-Time and Low-Power Streaming Source Separation
Using {Markov} Random Field",
journal = j-JETC,
volume = "14",
number = "2",
pages = "17:1--17:??",
month = jul,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3183351",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Nov 1 16:44:40 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Machine learning (ML) has revolutionized a wide range
of recognition tasks, ranging from text analysis to
speech to vision, most notably in cloud deployments.
However, mobile deployment of these ideas involves a
very different category of design problems. In this
article, we develop a hardware architecture for a sound
source separation task, intended for deployment on a
mobile phone. We focus on a novel Markov random field
(MRF) sound source separation algorithm that uses
expectation-maximization and Gibbs sampling to learn
MRF parameters on the fly and infer the best separation
of sources. The intrinsically iterative algorithm
suggests challenges for both speed and power. A
real-time streaming FPGA implementation runs at 150MHz
with 207KB RAM, achieves a speed-up of $ 22 \times $
over a software reference, performs with an SDR of up
to 7.021dB with 1.601ms latency, and exhibits excellent
perceived audio quality. A 45nm CMOS ASIC virtual
prototype simulated at 20MHz shows that this
architecture is small ($ < 10 $ million gates) and
consumes only 70mW, which is less than 2\% of the power
of an ARM Cortex-A9 software version. To the best of
our knowledge, this is the first Gibbs sampling
inference accelerator designed in conventional
FPGA/ASIC technology that targets a realistic mobile
perceptual application.",
acknowledgement = ack-nhfb,
articleno = "17",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Li:2018:GOF,
author = "Yixing Li and Zichuan Liu and Kai Xu and Hao Yu and
Fengbo Ren",
title = "A {GPU}-Outperforming {FPGA} Accelerator Architecture
for Binary Convolutional Neural Networks",
journal = j-JETC,
volume = "14",
number = "2",
pages = "18:1--18:??",
month = jul,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3154839",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Nov 1 16:44:40 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "FPGA-based hardware accelerators for convolutional
neural networks (CNNs) have received attention due to
their higher energy efficiency than GPUs. However, it
is challenging for FPGA-based solutions to achieve a
higher throughput than GPU counterparts. In this
article, we demonstrate that FPGA acceleration can be a
superior solution in terms of both throughput and
energy efficiency when a CNN is trained with binary
constraints on weights and activations. Specifically,
we propose an optimized fully mapped FPGA accelerator
architecture tailored for bitwise convolution and
normalization that features massive spatial parallelism
with deep pipelines stages. A key advantage of the FPGA
accelerator is that its performance is insensitive to
data batch size, while the performance of GPU
acceleration varies largely depending on the batch size
of the data. Experiment results show that the proposed
accelerator architecture for binary CNNs running on a
Virtex-7 FPGA is $ 8.3 \times $ faster and $ 75 \times
$ more energy-efficient than a Titan X GPU for
processing online individual requests in small batch
sizes. For processing static data in large batch sizes,
the proposed solution is on a par with a Titan X GPU in
terms of throughput while delivering $ 9.5 \times $
higher energy efficiency.",
acknowledgement = ack-nhfb,
articleno = "18",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Potok:2018:SCD,
author = "Thomas E. Potok and Catherine Schuman and Steven Young
and Robert Patton and Federico Spedalieri and Jeremy
Liu and Ke-Thia Yao and Garrett Rose and Gangotree
Chakma",
title = "A Study of Complex Deep Learning Networks on
High-Performance, Neuromorphic, and Quantum Computers",
journal = j-JETC,
volume = "14",
number = "2",
pages = "19:1--19:??",
month = jul,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3178454",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Nov 1 16:44:40 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Current deep learning approaches have been very
successful using convolutional neural networks trained
on large graphical-processing-unit-based computers.
Three limitations of this approach are that (1) they
are based on a simple layered network topology, i.e.,
highly connected layers, without intra-layer
connections; (2) the networks are manually configured
to achieve optimal results, and (3) the implementation
of the network model is expensive in both cost and
power. In this article, we evaluate deep learning
models using three different computing architectures to
address these problems: quantum computing to train
complex topologies, high performance computing to
automatically determine network topology, and
neuromorphic computing for a low-power hardware
implementation. We use the MNIST dataset for our
experiment, due to input size limitations of current
quantum computers. Our results show the feasibility of
using the three architectures in tandem to address the
above deep learning limitations. We show that a quantum
computer can find high quality values of intra-layer
connection weights in a tractable time as the
complexity of the network increases, a high performance
computer can find optimal layer-based topologies, and a
neuromorphic computer can represent the complex
topology and weights derived from the other
architectures in low power memristive hardware.",
acknowledgement = ack-nhfb,
articleno = "19",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Xu:2018:SPC,
author = "Jiang Xu and Yuichi Nakamura and Andrew Kahng",
title = "Silicon Photonics for Computing Systems",
journal = j-JETC,
volume = "14",
number = "2",
pages = "20:1--20:??",
month = jul,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3208198",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Nov 1 16:44:40 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
acknowledgement = ack-nhfb,
articleno = "20",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Zhang:2018:LBT,
author = "Zhe Zhang and Yaoyao Ye",
title = "A Learning-Based Thermal-Sensitive Power Optimization
Approach for Optical {NoCs}",
journal = j-JETC,
volume = "14",
number = "2",
pages = "21:1--21:??",
month = jul,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3173468",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Nov 1 16:44:40 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Optical networks-on-chip (NoCs) based on silicon
photonics have been proposed as emerging on-chip
communication architectures for chip multiprocessors
with large core counts. However, due to the thermal
sensitivity of optical devices used in optical NoCs,
on-chip temperature variations cause significant
thermal-induced optical power loss, which would
counteract the power advantages of optical NoCs. To
tackle this problem, in this work, we propose a
learning-based thermal-sensitive power optimization
approach for mesh- or torus-based optical NoCs in the
presence of temperature variations. The key techniques
proposed include an initial device-setting and
thermal-tuning mechanism that is a device-level
optimization technique, and a learning-based
thermal-sensitive adaptive routing algorithm that is a
network-level optimization technique. Simulation
results of an $ 8 \times 8 $ mesh-based optical NoC show
that the proposed initial device-setting and thermal-tuning
mechanism confines the worst-case thermal-induced
optical energy consumption to be on the order of tens
of pJ/bit, by avoiding significant thermal-induced
optical power loss caused by temperature-dependent
wavelength shifts. Besides, it shows that the
learning-based thermal-sensitive adaptive routing
algorithm is able to find an optimal path with the
minimum estimated thermal-induced optical power
consumption for each communication pair. The proposed
routing has a greater space for optimization,
especially for applications with more long-distance
traffic.",
acknowledgement = ack-nhfb,
articleno = "21",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Xu:2018:PVT,
author = "Yi Xu and Jun Yang and Rami Melhem",
title = "A Process-Variation-Tolerant Method for Nanophotonic
On-Chip Network",
journal = j-JETC,
volume = "14",
number = "2",
pages = "22:1--22:??",
month = jul,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3208073",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Nov 1 16:44:40 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Nanophotonic networks, a potential candidate for
future networks on-chip, have been challenged for their
reliability due to several device-level limitations.
One of the main issues is that fabrication errors
(a.k.a. process variations) can cause devices to
malfunction, rendering communication unreliable. For
example, the microring resonator, a preferred optical
modulator device, may not resonate at the designated
wavelength under process variations (PVs), leading to
communication errors and bandwidth loss. This article
proposes a series of solutions to the wavelength
drifting problem of microrings and subsequent bandwidth
loss problem of an optical network, due to PVs. The
objective is to maximize network bandwidth through
proper arrangement among microrings and wavelengths
with minimum power requirements. Our arrangement,
called ``MinTrim,'' solves this problem using simple
integer linear programming, adding supplementary
microrings, and allowing flexible assignment of
wavelengths to network nodes as long as the resulting
network presents maximal bandwidth. Each step is shown
to improve bandwidth provisioning with lower power
requirements. Evaluations on a sample network show that
a baseline network could lose more than 40\% bandwidth
due to PVs. Such loss can be recovered by MinTrim to
produce a network with 98.4\% working bandwidth. In
addition, the power required for arranging microrings
is 39\% lower than the baseline. Therefore, MinTrim
provides an efficient PV-tolerant solution to improving
the reliability of on-chip photonics.",
acknowledgement = ack-nhfb,
articleno = "22",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
%%% Fusella and Cilardo, JETC 14(2), article 23, July 2018.
%%% ISSN and ISSN-L are given in the print + electronic form used by the
%%% earlier JETC entries in this file.
%%% NOTE(review): CODEN ("????") and the closing page ("23:??") look like
%%% unresolved placeholders -- fill them in once the real values are known.
@Article{Fusella:2018:RPC,
author = "Edoardo Fusella and Alessandro Cilardo",
title = "Reducing Power Consumption of Lasers in Photonic
{NoCs} through Application-Specific Mapping",
journal = j-JETC,
volume = "14",
number = "2",
pages = "23:1--23:??",
month = jul,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3173463",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Nov 1 16:44:40 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "To face the complex communication problems that arise
as the number of on-chip components grows up, photonic
networks-on-chip (NoCs) have been recently proposed to
replace electronic interconnects. However, photonic
NoCs lack efficient laser sources, possibly resulting
in an inefficient or inoperable architecture. In this
article, we introduce a methodology for the design
space exploration of optical NoC mapping solutions,
which automatically assigns IPs/cores to the network
tiles such that the laser power consumption is
minimized. The experimental evaluation shows average
reductions of 34.7\% and 27.3\% in the power
consumption compared to, respectively,
application-oblivious and randomly mapped photonic
NoCs, allowing improved energy efficiency.",
acknowledgement = ack-nhfb,
articleno = "23",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
%%% Luo et al., JETC 14(2), article 24, July 2018.
%%% ISSN and ISSN-L are given in the print + electronic form used by the
%%% earlier JETC entries in this file.
%%% NOTE(review): the acute accents on the given names Cedric and Sebastien
%%% were restored from the authors' usual spelling -- confirm against the
%%% published article.
%%% NOTE(review): CODEN ("????") and the closing page ("24:??") look like
%%% unresolved placeholders -- fill them in once the real values are known.
@Article{Luo:2018:OOW,
author = "Jiating Luo and C{\'e}dric Killian and S{\'e}bastien
{Le Beux} and Daniel Chillet and Olivier Sentieys and
Ian O'Connor",
title = "Offline Optimization of Wavelength Allocation and
Laser Power in Nanophotonic Interconnects",
journal = j-JETC,
volume = "14",
number = "2",
pages = "24:1--24:??",
month = jul,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3178453",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Nov 1 16:44:40 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "Optical Network-on-Chip (ONoC) is a promising
communication medium for large-scale multiprocessor
systems-on-chips. Indeed, ONoC can outperform classical
electrical NoCs in terms of energy efficiency and
bandwidth density, in particular, because this medium
can support multiple transactions at the same time on
different wavelengths by using Wavelength Division
Multiplexing (WDM). However, multiple signals sharing
simultaneously the same part of a waveguide can lead to
inter-channel crosstalk noise. This problem impacts the
signal-to-noise ratio of the optical signals, which
leads to an increase in the Bit Error Rate (BER) at the
receiver side. If a specific BER is targeted, an
increase of laser power should be necessary to satisfy
the SNR. In this context, an important issue is to
evaluate the laser power needed to satisfy the various
desired communication bandwidths based on the BER
performance requirements. In this article, we propose
an off-line approach that concurrently optimizes the
laser power scaling and execution time of a global
application. A set of different levels of power is
introduced for each laser, to ensure that optical
signals can be emitted with just-enough power to ensure
targeted BER. As a result, most promising solutions are
highlighted for mapping a defined application onto a
16-core ring-based WDM ONoC.",
acknowledgement = ack-nhfb,
articleno = "24",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
%%% Vanwinkle and Kodi, JETC 14(2), article 25, July 2018.
%%% ISSN and ISSN-L are given in the print + electronic form used by the
%%% earlier JETC entries in this file.
%%% NOTE(review): CODEN ("????") and the closing page ("25:??") look like
%%% unresolved placeholders -- fill them in once the real values are known.
@Article{Vanwinkle:2018:SSH,
author = "Scott Vanwinkle and Avinash Karanth Kodi",
title = "{SHARP}: Shared Heterogeneous Architecture with
Reconfigurable Photonic Network-on-Chip",
journal = j-JETC,
volume = "14",
number = "2",
pages = "25:1--25:??",
month = jul,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3185383",
ISSN = "1550-4832 (print), 1550-4840 (electronic)",
ISSN-L = "1550-4832",
bibdate = "Thu Nov 1 16:44:40 MDT 2018",
bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib",
abstract = "As the relentless quest for higher throughput and
lower energy cost continues in heterogeneous
multicores, there is a strong demand for
energy-efficient and high-performance Network-on-Chip
(NoC) architectures. Heterogeneous architectures that
can simultaneously utilize both the serialized nature
of the CPU as well as the thread level parallelism of
the GPU are gaining traction in the industry. A
critical issue with heterogeneous architectures is
finding an optimal way to utilize the shared resources
such as the last level cache and NoC without hindering
the performance of either the CPU or the GPU core.
Photonic interconnects are a disruptive technology
solution that has the potential to increase the
bandwidth, reduce latency, and improve
energy-efficiency over traditional metallic
interconnects. In this article, we propose a CPU-GPU
heterogeneous architecture called Shared Heterogeneous
Architecture with Reconfigurable Photonic
Network-on-Chip (SHARP) that clusters CPU and GPU cores
around the same router and dynamically allocates
bandwidth between the CPU and GPU cores based on
application demands. The SHARP architecture is designed
as a Single-Writer Multiple-Reader (SWMR) crossbar with
reservation-assist to connect CPU/GPU cores that
dynamically reallocates bandwidth using buffer
utilization information at runtime. As network traffic
exhibits temporal and spatial fluctuations due to
application behavior, SHARP can dynamically reallocate
bandwidth and thereby adapt to application demands.
SHARP demonstrates 34\% performance (throughput)
improvement over a baseline electrical CMESH while
consuming 25\% less energy per bit. Simulation results
have also shown 6.9\% to 14.9\% performance improvement
over other flavors of the proposed SHARP architecture
without dynamic bandwidth allocation.",
acknowledgement = ack-nhfb,
articleno = "25",
fjournal = "ACM Journal on Emerging Technologies in Computing
Systems (JETC)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967",
}
@Article{Ishihara:2018:INP,
author = "Tohru Ishihara and Akihiko Shinya and