%%% -*-BibTeX-*-
%%% ====================================================================
%%% BibTeX-file{
%%%     author          = "Nelson H. F. Beebe",
%%%     version         = "1.49",
%%%     date            = "20 March 2024",
%%%     time            = "07:27:45 MST",
%%%     filename        = "trets.bib",
%%%     address         = "University of Utah
%%%                        Department of Mathematics, 110 LCB
%%%                        155 S 1400 E RM 233
%%%                        Salt Lake City, UT 84112-0090
%%%                        USA",
%%%     telephone       = "+1 801 581 5254",
%%%     FAX             = "+1 801 581 4148",
%%%     URL             = "https://www.math.utah.edu/~beebe",
%%%     checksum        = "02606 20576 105872 1019069",
%%%     email           = "beebe at math.utah.edu, beebe at acm.org,
%%%                        beebe at computer.org (Internet)",
%%%     codetable       = "ISO/ASCII",
%%%     keywords        = "ACM Transactions on Reconfigurable Technology
%%%                        and Systems; bibliography; TRETS",
%%%     license         = "public domain",
%%%     supported       = "yes",
%%%     docstring       = "This is a COMPLETE BibTeX bibliography for
%%%                        ACM Transactions on Reconfigurable Technology
%%%                        and Systems (CODEN ????, ISSN 1936-7406
%%%                        (print), 1936-7414 (electronic)), covering
%%%                        all journal issues from 2008 -- date.
%%%
%%%                        At version 1.49, the COMPLETE journal
%%%                        coverage looked like this:
%%%
%%%                             2008 (  17)    2014 (  27)    2020 (  21)
%%%                             2009 (  33)    2015 (  44)    2021 (  20)
%%%                             2010 (  37)    2016 (  29)    2022 (  52)
%%%                             2011 (  29)    2017 (  20)    2023 (  64)
%%%                             2012 (  22)    2018 (  28)    2024 (  18)
%%%                             2013 (  19)    2019 (  20)
%%%
%%%                             Article:        500
%%%
%%%                             Total entries:  500
%%%
%%%                        The journal table of contents page is at:
%%%
%%%                            http://www.acm.org/trets/
%%%                            http://portal.acm.org/toc.cfm?id=J1151
%%%
%%%                        Qualified subscribers can retrieve the full
%%%                        text of recent articles in PDF form.
%%%
%%%                        The initial draft was extracted from the ACM
%%%                        Web pages.
%%%
%%%                        ACM copyrights explicitly permit abstracting
%%%                        with credit, so article abstracts, keywords,
%%%                        and subject classifications have been
%%%                        included in this bibliography wherever
%%%                        available.  Article reviews have been
%%%                        omitted, until their copyright status has
%%%                        been clarified.
%%%
%%%                        bibsource keys in the bibliography entries
%%%                        below indicate the entry originally came
%%%                        from the computer science bibliography
%%%                        archive, even though it has likely since
%%%                        been corrected and updated.
%%%
%%%                        URL keys in the bibliography point to
%%%                        World Wide Web locations of additional
%%%                        information about the entry.
%%%
%%%                        BibTeX citation tags are uniformly chosen
%%%                        as name:year:abbrev, where name is the
%%%                        family name of the first author or editor,
%%%                        year is a 4-digit number, and abbrev is a
%%%                        3-letter condensation of important title
%%%                        words. Citation tags were automatically
%%%                        generated by software developed for the
%%%                        BibNet Project.
%%%
%%%                        In this bibliography, entries are sorted in
%%%                        publication order, using ``bibsort -byvolume.''
%%%
%%%                        The checksum field above contains a CRC-16
%%%                        checksum as the first value, followed by the
%%%                        equivalent of the standard UNIX wc (word
%%%                        count) utility output of lines, words, and
%%%                        characters.  This is produced by Robert
%%%                        Solovay's checksum utility."
%%%     }
%%% ====================================================================
%%% TeX preamble, copied by BibTeX to the head of the generated
%%% bibliography.  It reads bibnames.sty, supplies fallback definitions
%%% of \circled, \pkg, and \reg (each made only when that macro is still
%%% undefined), and defines \TM unconditionally.  The title and abstract
%%% of entry Lu:2008:DCR use \reg.
@Preamble{"\input bibnames.sty" #
    "\ifx \undefined \circled \def \circled   #1{(#1)}        \fi" #
    "\ifx \undefined \pkg     \def \pkg       #1{{{\tt #1}}}  \fi" #
    "\ifx \undefined \reg     \def \reg         {\circled{R}} \fi" #
    "\def \TM {${}^{\sc TM}$}"
}

%%% ====================================================================
%%% Acknowledgement abbreviations:
%%% ack-nhfb is the value of the acknowledgement field in the entries
%%% below.  It records Nelson H. F. Beebe's postal address, telephone
%%% and FAX numbers, e-mail addresses, and home-page URL.
@String{ack-nhfb = "Nelson H. F. Beebe,
                    University of Utah,
                    Department of Mathematics, 110 LCB,
                    155 S 1400 E RM 233,
                    Salt Lake City, UT 84112-0090, USA,
                    Tel: +1 801 581 5254,
                    FAX: +1 801 581 4148,
                    e-mail: \path|beebe@math.utah.edu|,
                            \path|beebe@acm.org|,
                            \path|beebe@computer.org| (Internet),
                    URL: \path|https://www.math.utah.edu/~beebe/|"}

%%% ====================================================================
%%% Journal abbreviations:
%%% j-TRETS: full journal name, referenced by the journal field of the
%%% entries below.
@String{j-TRETS = {ACM Transactions on Reconfigurable Technology
                                  and Systems (TRETS)}}

%%% ====================================================================
%%% Bibliography entries:
@Article{Buell:2008:I,
  author       = {Duncan Buell and Wayne Luk},
  title        = {Introduction},
  journal      = j-TRETS,
  fjournal     = {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal     = {ACM Trans. Reconfigurable Technol. Syst.},
  journal-URL  = {https://dl.acm.org/loi/trets},
  volume       = {1},
  number       = {1},
  articleno    = {1},
  pages        = {1:1--1:??},
  month        = mar,
  year         = {2008},
  DOI          = {https://doi.org/10.1145/1331897.1331898},
  ISSN         = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L       = {1936-7406},
  CODEN        = {????},
  bibdate      = {Tue Nov 4 17:12:41 MST 2008},
  bibsource    = {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{DeHon:2008:GET,
  author       = {Andr{\'e} DeHon and Mike Hutton},
  title        = {Guest Editorial: {TRETS} Special Edition on the {15th
                 International Symposium on FPGAs}},
  journal      = j-TRETS,
  fjournal     = {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal     = {ACM Trans. Reconfigurable Technol. Syst.},
  journal-URL  = {https://dl.acm.org/loi/trets},
  volume       = {1},
  number       = {1},
  articleno    = {2},
  pages        = {2:1--2:??},
  month        = mar,
  year         = {2008},
  DOI          = {https://doi.org/10.1145/1331897.1341292},
  ISSN         = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L       = {1936-7406},
  CODEN        = {????},
  bibdate      = {Tue Nov 4 17:12:41 MST 2008},
  bibsource    = {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Matsumoto:2008:SID,
  author =       "Yohei Matsumoto and Masakazu Hioki and Takashi
                 Kawanami and Hanpei Koike and Toshiyuki Tsutsumi and
                 Tadashi Nakagawa and Toshihiro Sekigawa",
  title =        "Suppression of Intrinsic Delay Variation in {FPGAs}
                 using Multiple Configurations",
  journal =      j-TRETS,
  volume =       "1",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1331897.1331899",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 4 17:12:41 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "A new method for improving the timing yield of
                 field-programmable gate array (FPGA) devices affected
                 by intrinsic within-die variation is proposed. The
                 timing variation is reduced by selecting an appropriate
                 configuration for each chip from a set of independent
                 configurations, the critical paths of which do not
                 share the same circuit resources on the FPGA. In this
                 article, the actual method used to generate independent
                 multiple configurations by simply repeating the routing
                 phase is shown, along with the results of Monte Carlo
                 simulation with 10,000 samples. One simulation result
                 showed that the standard deviations of maximum critical
                 path delays are reduced by 28\% and 49\% for 10\% and
                 30\% V$_{th}$ variations ($ \sigma / \mu $),
                 respectively, with 10 independent configurations.
                 Therefore, the proposed method is especially effective
                 for larger V$_{th}$ variation and is expected to be
                 useful for suppressing the performance variation of
                 FPGAs due to the future increase of parameter
                 variation. Another simulation result showed that the
                 effectiveness of the proposed technique was saturated
                 at the use of 10 or more configurations because of the
                 degradation of the quality of the configurations.
                 Therefore, the use of 10 or fewer configurations is
                 reasonable.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "configuration; FPGA; timing yield; within-die
                 variation",
}

@Article{Sivaswamy:2008:SAP,
  author       = {Satish Sivaswamy and Kia Bazargan},
  title        = {Statistical Analysis and Process Variation-Aware
                 Routing and Skew Assignment for {FPGAs}},
  journal      = j-TRETS,
  fjournal     = {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal     = {ACM Trans. Reconfigurable Technol. Syst.},
  journal-URL  = {https://dl.acm.org/loi/trets},
  volume       = {1},
  number       = {1},
  articleno    = {4},
  pages        = {4:1--4:??},
  month        = mar,
  year         = {2008},
  DOI          = {https://doi.org/10.1145/1331897.1331900},
  ISSN         = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L       = {1936-7406},
  CODEN        = {????},
  keywords     = {routing; skew assignment; statistical timing
                 analysis},
  abstract     = {With constant scaling of process technologies, chip
                 design is becoming increasingly difficult due to
                 process variations. The FPGA community has only
                 recently started focusing on the effects of variations.
                 In this work we present a statistical analysis to
                 compare the effects of variations on designs mapped to
                 FPGAs and ASICs. We also present CAD and architecture
                 techniques to mitigate the impact of variations. First
                 we present a variation-aware router that optimizes
                 statistical criticality. We then propose a modification
                 to the clock network to deliver programmable skews to
                 different flip-flops. Finally, we combine the two
                 techniques and the result is a 9x reduction in yield
                 loss that translates to a 12\% improvement in timing
                 yield. When the desired timing yield is set to 99\%,
                 our combined statistical routing and skew assignment
                 technique results in a delay improvement of about 10\%
                 over a purely deterministic approach.},
  bibdate      = {Tue Nov 4 17:12:41 MST 2008},
  bibsource    = {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Lu:2008:DCR,
  author       = {Shih-Lien L. Lu and Peter Yiannacouras and Taeweon Suh
                 and Rolf Kassa and Michael Konow},
  title        = {A Desktop Computer with a Reconfigurable
                 {Pentium\reg}},
  journal      = j-TRETS,
  fjournal     = {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal     = {ACM Trans. Reconfigurable Technol. Syst.},
  journal-URL  = {https://dl.acm.org/loi/trets},
  volume       = {1},
  number       = {1},
  articleno    = {5},
  pages        = {5:1--5:??},
  month        = mar,
  year         = {2008},
  DOI          = {https://doi.org/10.1145/1331897.1331901},
  ISSN         = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L       = {1936-7406},
  CODEN        = {????},
  keywords     = {accelerator; architecture; emulator; exploration;
                 FPGA; model; operating system; Pentium processor;
                 reconfigurable; simulator},
  abstract     = {Advancements in reconfigurable technologies,
                 specifically FPGAs, have yielded faster, more
                 power-efficient reconfigurable devices with enormous
                 capacities. In our work, we provide testament to the
                 impressive capacity of recent FPGAs by hosting a
                 complete Pentium$^{{\reg }}$ in a single FPGA chip. In
                 addition we demonstrate how FPGAs can be used for
                 microprocessor design space exploration while
                 overcoming the tension between simulation speed, model
                 accuracy, and model completeness found in traditional
                 software simulator environments. Specifically, we
                 perform preliminary experimentation/prototyping with an
                 original Socket 7 based desktop processor system with
                 typical hardware peripherals running modern operating
                 systems such as Fedora Core 4 and Windows XP; however
                 we have inserted a Xilinx Virtex-4 in place of the
                 processor that should sit in the motherboard and have
                 used the Virtex-4 to host a complete version of the
                 Pentium$^{{\reg }}$ microprocessor (which consumes less
                 than half its resources). We can therefore apply
                 architectural changes to the processor and evaluate
                 their effects on the complete desktop system. We use
                 this FPGA-based emulation system to conduct preliminary
                 architectural experiments including growing the branch
                 target buffer and the level 1 caches. In addition, we
                 experimented with interfacing hardware accelerators
                 such as DES and AES engines which resulted in a 27x
                 speedup.},
  bibdate      = {Tue Nov 4 17:12:41 MST 2008},
  bibsource    = {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Feng:2008:DEI,
  author       = {Wenyi Feng and Sinan Kaptanoglu},
  title        = {Designing Efficient Input Interconnect Blocks for
                 {LUT} Clusters Using Counting and Entropy},
  journal      = j-TRETS,
  fjournal     = {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal     = {ACM Trans. Reconfigurable Technol. Syst.},
  journal-URL  = {https://dl.acm.org/loi/trets},
  volume       = {1},
  number       = {1},
  articleno    = {6},
  pages        = {6:1--6:??},
  month        = mar,
  year         = {2008},
  DOI          = {https://doi.org/10.1145/1331897.1331902},
  ISSN         = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L       = {1936-7406},
  CODEN        = {????},
  keywords     = {cluster; counting; entropy; FPGAs; interconnect; LUT;
                 PLDs},
  abstract     = {In a cluster-based FPGA, the interconnect from
                 external routing tracks and cluster feedbacks to the
                 LUT inputs consumes significant area, and no consensus
                 has emerged among different implementations (e.g.,
                 1-level or 2-level). In this paper, we model this
                 interconnect as a unified input interconnect block
                 (IIB). We identify three types of IIBs and develop
                 general combinatorial techniques to count the number of
                 distinct functional configurations for them. We use
                 entropy, defined as the logarithm of this count, to
                 estimate an IIB's routing flexibility. This enables us
                 to analytically evaluate different IIBs without the
                 customary time-consuming place and route experiments.
                 We show that both depopulated 1-level IIBs and
                 VPR-style 2-level IIBs achieve high routing flexibility
                 but lack area efficiency. We propose a novel class of
                 highly efficient, yet still simple, IIBs that use
                 substantially fewer switches with only a small
                 degradation in routing flexibility. Experimental
                 results verify the routability of these IIBs, and
                 confirm that entropy is a good predictor of
                 routability.},
  bibdate      = {Tue Nov 4 17:12:41 MST 2008},
  bibsource    = {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Wilton:2008:SDO,
  author       = {Steven J. E. Wilton and Chun Hok Ho and Bradley
                 Quinton and Philip H. W. Leong and Wayne Luk},
  title        = {A Synthesizable Datapath-Oriented Embedded {FPGA}
                 Fabric for Silicon Debug Applications},
  journal      = j-TRETS,
  fjournal     = {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal     = {ACM Trans. Reconfigurable Technol. Syst.},
  journal-URL  = {https://dl.acm.org/loi/trets},
  volume       = {1},
  number       = {1},
  articleno    = {7},
  pages        = {7:1--7:??},
  month        = mar,
  year         = {2008},
  DOI          = {https://doi.org/10.1145/1331897.1331903},
  ISSN         = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L       = {1936-7406},
  CODEN        = {????},
  keywords     = {Field programmable gate array; integrated circuit;
                 silicon debug; system-on-chip},
  abstract     = {We present an architecture for a synthesizable
                 datapath-oriented FPGA core that can be used to provide
                 post-fabrication flexibility to an SoC. Our
                 architecture is optimized for bus-based operations and
                 employs a directional routing architecture, which
                 allows it to be synthesized using standard ASIC design
                 tools and flows. The primary motivation for this
                 architecture is to provide an efficient mechanism to
                 support on-chip debugging. The fabric can also be used
                 to implement other datapath-oriented circuits such as
                 those needed in signal processing and
                 computation-intensive applications. We evaluate our
                 architecture using a set of benchmark circuits and
                 compare it to previous fabrics in terms of area, speed,
                 and power.},
  bibdate      = {Tue Nov 4 17:12:41 MST 2008},
  bibsource    = {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Guneysu:2008:SPH,
  author       = {Tim G{\"u}neysu and Christof Paar and Jan Pelzl},
  title        = {Special-Purpose Hardware for Solving the Elliptic
                 Curve Discrete Logarithm Problem},
  journal      = j-TRETS,
  fjournal     = {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal     = {ACM Trans. Reconfigurable Technol. Syst.},
  journal-URL  = {https://dl.acm.org/loi/trets},
  volume       = {1},
  number       = {2},
  articleno    = {8},
  pages        = {8:1--8:??},
  month        = jun,
  year         = {2008},
  DOI          = {https://doi.org/10.1145/1371579.1371580},
  ISSN         = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L       = {1936-7406},
  CODEN        = {????},
  keywords     = {cryptanalysis; discrete logarithm; elliptic curve
                 cryptosystem; Pollard's rho},
  abstract     = {The resistance against powerful index-calculus attacks
                 makes Elliptic Curve Cryptosystems (ECC) an interesting
                 alternative to conventional asymmetric cryptosystems,
                 like RSA. Operands in ECC require significantly less
                 bits at the same level of security, resulting in a
                 higher computational efficiency compared to RSA. With
                 growing computational capabilities and continuous
                 technological improvements over the years, however, the
                 question of the security of ECC against attacks based
                 on special-purpose hardware arises. In this context,
                 recently emerged low-cost FPGAs demand for attention in
                 the domain of hardware-based cryptanalysis: the
                 extraordinary efficiency of modern programmable
                 hardware devices allow for a low-budget implementation
                 of hardware-based ECC attacks---without the requirement
                 of the expensive development of ASICs.\par

                 With focus on the aspect of cost-efficiency, this
                 contribution presents and analyzes an FPGA-based
                 architecture of an attack against ECC over prime
                 fields. A multi-processing hardware architecture for
                 Pollard's Rho method is described. We provide results
                 on actually used key lengths of ECC (128 bits and
                 above) and estimate the expected runtime for a
                 successful attack.\par

                 As a first result, currently used elliptic curve
                 cryptosystems with a security of 160 bit and above turn
                 out to be infeasible to break with available
                 computational and financial resources. However, some of
                 the security standards proposed by the Standards for
                 Efficient Cryptography Group (SECG) become subject to
                 attacks based on low-cost FPGAs.},
  bibdate      = {Tue Nov 4 17:12:42 MST 2008},
  bibsource    = {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Jacob:2008:MBA,
  author       = {Arpith Jacob and Joseph Lancaster and Jeremy Buhler
                 and Brandon Harris and Roger D. Chamberlain},
  title        = {{Mercury BLASTP}: Accelerating Protein Sequence
                 Alignment},
  journal      = j-TRETS,
  fjournal     = {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal     = {ACM Trans. Reconfigurable Technol. Syst.},
  journal-URL  = {https://dl.acm.org/loi/trets},
  volume       = {1},
  number       = {2},
  articleno    = {9},
  pages        = {9:1--9:??},
  month        = jun,
  year         = {2008},
  DOI          = {https://doi.org/10.1145/1371579.1371581},
  ISSN         = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L       = {1936-7406},
  CODEN        = {????},
  keywords     = {bioinformatics; biological sequence alignment},
  abstract     = {Large-scale protein sequence comparison is an
                 important but compute-intensive task in molecular
                 biology. BLASTP is the most popular tool for
                 comparative analysis of protein sequences. In recent
                 years, an exponential increase in the size of protein
                 sequence databases has required either exponentially
                 more running time or a cluster of machines to keep
                 pace. To address this problem, we have designed and
                 built a high-performance FPGA-accelerated version of
                 BLASTP, {\em Mercury BLASTP}. In this article, we
                 describe the architecture of the portions of the
                 application that are accelerated in the FPGA, and we
                 also describe the integration of these FPGA-accelerated
                 portions with the existing BLASTP software. We have
                 implemented Mercury BLASTP on a commodity workstation
                 with two Xilinx Virtex-II 6000 FPGAs. We show that the
                 new design runs 11--15 times faster than software
                 BLASTP on a modern CPU while delivering close to 99\%
                 identical results.},
  bibdate      = {Tue Nov 4 17:12:42 MST 2008},
  bibsource    = {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Sedcole:2008:PYM,
  author       = {Pete Sedcole and Peter Y. K. Cheung},
  title        = {Parametric Yield Modeling and Simulations of {FPGA}
                 Circuits Considering Within-Die Delay Variations},
  journal      = j-TRETS,
  fjournal     = {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal     = {ACM Trans. Reconfigurable Technol. Syst.},
  journal-URL  = {https://dl.acm.org/loi/trets},
  volume       = {1},
  number       = {2},
  articleno    = {10},
  pages        = {10:1--10:??},
  month        = jun,
  year         = {2008},
  DOI          = {https://doi.org/10.1145/1371579.1371582},
  ISSN         = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L       = {1936-7406},
  CODEN        = {????},
  keywords     = {delay; FPGA; modeling; process variation;
                 reconfiguration; statistical theory; within-die
                 variability; yield},
  abstract     = {Variations in the semiconductor fabrication process
                 results in differences in parameters between
                 transistors on the same die, a problem exacerbated by
                 lithographic scaling. Field-Programmable Gate Arrays
                 may be able to compensate for within-die delay
                 variability, by judicious use of reconfigurability.
                 This article presents two strategies for compensating
                 within-die stochastic delay variability by using
                 reconfiguration: reconfiguring the entire FPGA, and
                 relocating subcircuits within an FPGA. Analytical
                 models for the theoretical bounds on the achievable
                 gains are derived for both strategies and compared to
                 models for worst-case design as well as statistical
                 static timing analysis (SSTA). All models are validated
                 by comparison to circuit-level Monte Carlo simulations.
                 It is demonstrated that significant improvements in
                 circuit yield and timing are possible using SSTA alone,
                 and these improvements can be enhanced by employing
                 reconfiguration-based techniques.},
  bibdate      = {Tue Nov 4 17:12:42 MST 2008},
  bibsource    = {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Gorjiara:2008:MDC,
  author       = {Bita Gorjiara and Mehrdad Reshadi and Daniel Gajski},
  title        = {Merged Dictionary Code Compression for {FPGA}
                 Implementation of Custom Microcoded {PEs}},
  journal      = j-TRETS,
  fjournal     = {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal     = {ACM Trans. Reconfigurable Technol. Syst.},
  journal-URL  = {https://dl.acm.org/loi/trets},
  volume       = {1},
  number       = {2},
  articleno    = {11},
  pages        = {11:1--11:??},
  month        = jun,
  year         = {2008},
  DOI          = {https://doi.org/10.1145/1371579.1371583},
  ISSN         = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L       = {1936-7406},
  CODEN        = {????},
  keywords     = {dictionary based compression; FPGA; memory
                 optimization; microcoded architectures;
                 no-instruction-set computer},
  abstract     = {Horizontal Microcoded Architecture (HMA) is a paradigm
                 for designing programmable high-performance processing
                 elements (PEs). However, it suffers from large code
                 size, which can be addressed by compression. In this
                 article, we study the code size of one of the new
                 HMA-based technologies called No-Instruction-Set
                 Computer (NISC). We show that NISC code size can be
                 several times larger than a typical RISC processor, and
                 we propose several low-overhead dictionary-based code
                 compression techniques to reduce its code size. Our
                 compression algorithm leverages the knowledge of
                 ``don't care'' values in the control words and can
                 reduce the code size by 3.3 times, on average. Despite
                 such good results, as shown in this article, these
                 compression techniques lead to poor FPGA
                 implementations because they require many on-chip RAMs.
                 To address this issue, we introduce an FPGA-aware
                 dictionary-based technique that uses the dual-port
                 feature of on-chip RAMs to reduce the number of
                 utilized block RAMs by half. Additionally, we propose
                 cascading two-levels of dictionaries for code size and
                 block RAM reduction of large programs. For an MP3
                 application, a merged, cascaded, three-dictionary
                 implementation reduces the number of utilized block
                 RAMs by 4.3 times (76\%) compared to a NISC without
                 compression. This corresponds to 20\% additional
                 savings over the best single level dictionary-based
                 compression.},
  bibdate      = {Tue Nov 4 17:12:42 MST 2008},
  bibsource    = {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Thomas:2008:MGR,
  author =       "David B. Thomas and Wayne Luk",
  title =        "Multivariate {Gaussian} Random Number Generation
                 Targeting Reconfigurable Hardware",
  journal =      j-TRETS,
  volume =       "1",
  number =       "2",
  pages =        "12:1--12:??",
  month =        jun,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1371579.1371584",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 4 17:12:42 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/prng.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "The multivariate Gaussian distribution is often used
                 to model correlations between stochastic time-series,
                 and can be used to explore the effect of these
                 correlations across $N$ time-series in Monte-Carlo
                 simulations. However, generating random correlated
                 vectors is an $ O(N^2) $ process, and quickly becomes a
                 computational bottleneck in software simulations. This
                 article presents an efficient method for generating
                 vectors in parallel hardware, using $N$ parallel
                 pipelined components to generate a new vector every $N$
                 cycles. This method maps well to the embedded block
                 RAMs and multipliers in contemporary FPGAs,
                 particularly as extensive testing shows that the
                 limited bit-width arithmetic does not reduce the
                 statistical quality of the generated vectors. An
                 implementation of the architecture in the Virtex-4
                 architecture achieves a 500MHz clock-rate, and can
                 support vector lengths up to 512 in the largest
                 devices. The combination of a high clock-rate and
                 parallelism provides a significant performance
                 advantage over conventional processors, with an
                 xc4vsx55 device at 500MHz providing a 200 times speedup
                 over an Opteron 2.6GHz using an AMD optimised BLAS
                 package. In a case study in Delta-Gamma Value-at-Risk,
                 an RC2000 accelerator card using an xc4vsx55 at 400MHz
                 is 26 times faster than a quad Opteron 2.6GHz SMP.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "FPGA; multivariate Gaussian distribution; random
                 numbers",
}

@Article{Lamoureux:2008:TBP,
  author =       "Julien Lamoureux and Steven J. E. Wilton",
  title =        "On the trade-off between power and flexibility of
                 {FPGA} clock networks",
  journal =      j-TRETS,
  volume =       "1",
  number =       "3",
  pages =        "13:1--13:??",
  month =        sep,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1391732.1391733",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 4 17:12:44 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "FPGA clock networks consume a significant amount of
                 power, since they toggle every clock cycle and must be
                 flexible enough to implement the clocks for a wide
                 range of different applications. The efficiency of FPGA
                 clock networks can be improved by reducing this
                 flexibility; however, reducing the flexibility
                 introduces stricter constraints during the clustering
                 and placement stages of the FPGA CAD flow. These
                 constraints can reduce the overall efficiency of the
                 final implementation. This article examines the
                 trade-off between the power consumption and flexibility
                 of FPGA clock networks.\par

                 Specifically, this article makes three contributions.
                 First, it presents a new parameterized clock-network
                 framework for describing and comparing FPGA clock
                 networks. Second, it describes new clock-aware
                 placement techniques that are needed to find a legal
                 placement satisfying the constraints imposed by the
                 clock network. Finally, it performs an empirical study
                 to examine the trade-off between the power consumption
                 of the clock network and the impact of the CAD
                 constraints for a number of different clock networks
                 with varying amounts of flexibility.\par

                 The results show that the techniques used to produce a
                 legal placement can have a significant influence on
                 power and the ability of the placer to find a legal
                 solution. On average, circuits placed using the most
                 effective techniques dissipate 5\% less overall energy
                 and are significantly more likely to be legal than
                 circuits placed using other techniques. Moreover, the
                 results show that the architecture of the clock network
                 is also important. On average, FPGAs with an efficient
                 clock network are up to 14.6\% more energy efficient
                 compared to other FPGAs.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "clock distribution networks; clock-aware placement;
                 FPGA; low-power design",
}

@Article{Slogsnat:2008:OSH,
  author =       "David Slogsnat and Alexander Giese and Mondrian
                 N{\"u}ssle and Ulrich Br{\"u}ning",
  title =        "An open-source {HyperTransport} core",
  journal =      j-TRETS,
  volume =       "1",
  number =       "3",
  pages =        "14:1--14:??",
  month =        sep,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1391732.1391734",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 4 17:12:44 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "This article presents the design of a generic
                 HyperTransport (HT) core. HyperTransport is a
                 packet-based interconnect technology for low-latency,
                 high-bandwidth point-to-point connections. It is
                 specially optimized to achieve a very low latency. The
                 core has been verified in system using an FPGA. This
                 exhaustive verification and the generic design allow
                 the mapping to both ASICs and FPGAs. The implementation
                 described in this work supports a 16-bit link width, as
                 used by Opteron processors. On a Xilinx Virtex-4 FX60,
                 the core supports a link frequency of 400 MHz DDR and
                 offers a maximum bidirectional bandwidth of 3.2GB/s.
                 The in-system verification has been performed using a
                 custom FPGA board that has been plugged into a
                 HyperTransport extension connector (HTX) of a standard
                 Opteron-based motherboard. HTX slots in Opteron-based
                 motherboards allow very high-bandwidth, low-latency
                 communication, since the HTX device is directly
                 connected to one of the HyperTransport links of the
                 processor. Performance analysis shows a unidirectional
                 payload bandwidth of 1.4GB/s and a read latency of 180
                 ns. The HT core in combination with the HTX board is an
                 ideal base for prototyping systems and implementing
                 FPGA coprocessors. The HT core is available as open
                 source.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "FPGA; HTX; HyperTransport; prototyping; RTL",
}

@Article{Beeckler:2008:PGR,
  author =       "John S. Beeckler and Warren J. Gross",
  title =        "Particle graphics on reconfigurable hardware",
  journal =      j-TRETS,
  volume =       "1",
  number =       "3",
  pages =        "15:1--15:??",
  month =        sep,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1391732.1391735",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 4 17:12:44 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Particle graphics simulations are well suited for
                 modeling complex phenomena such as water, cloth,
                 explosions, fire, smoke, and clouds. They are normally
                 realized in software as part of an interactive graphics
                 application. The computational complexity of particle
                 graphics simulations restricts the number of particles
                 that can be updated in software at interactive frame
                 rates. This article presents the design and
                 implementation of a hardware particle graphics engine
                 for accelerating real-time particle graphics
                 simulations. We explore the design process,
                 implementation issues, and limitations of using
                 field-programmable gate arrays (FPGAs) for the
                 acceleration of particle graphics. The FPGA particle
                 engine processes million-particle systems at a rate
                 from 47 to 112 million particles per second, which
                 represents one to two orders of magnitude speedup over
                 a 2.8 GHz CPU. Using three FPGAs, a maximum sustained
                 performance of 112 million particles per second was
                 achieved.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "FPGAs; particle systems; reconfigurable computing;
                 special-purpose architectures",
}

@Article{Grant:2008:PMS,
  author =       "David Grant and Guy Lemieux",
  title =        "Perturb $+$ mutate: Semisynthetic circuit generation
                 for incremental placement and routing",
  journal =      j-TRETS,
  volume =       "1",
  number =       "3",
  pages =        "16:1--16:??",
  month =        sep,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1391732.1391736",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 4 17:12:44 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "CAD tool designers are always searching for more
                 benchmark circuits to stress their software. In this
                 article we present a heuristic method to generate
                 benchmark circuits specially suited for incremental
                 place-and-route tools. The method removes part of a
                 real circuit and replaces it with an altered version of
                 the same circuit to mimic an incremental design change.
                 The alteration consists of two steps: {\em mutate\/}
                 followed by {\em perturb}. The perturb step exactly
                 preserves as many circuit characteristics as possible.
                 While perturbing, reproduction of interconnect
                 locality, a characteristic that is difficult to measure
                 reliably or reproduce exactly, is controlled using a
                 new technique, {\em ancestor depth control\/} (ADC).
                 Perturbing with ADC produces circuits with postrouting
                 properties that match the best techniques known
                 to-date. The mutate step produces targeted mutations
                 resulting in controlled changes to specific circuit
                 properties (while keeping other properties constant).
                 We demonstrate one targeted mutation heuristic, scale,
                 to significantly change circuit size with little change
                 to other circuit characteristics. The method is simple
                 enough for inclusion in a CAD tool directly, and fast
                 enough for use in on-the-fly benchmark generation.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "automated development tools; design automation; graph
                 algorithms; hardware-supporting software; place and
                 route; testing",
}

@Article{Hsiung:2008:PSB,
  author =       "Pao-Ann Hsiung and Chao-Sheng Lin and Chih-Feng Liao",
  title =        "{Perfecto}: a {SystemC}-based design-space exploration
                 framework for dynamically reconfigurable
                 architectures",
  journal =      j-TRETS,
  volume =       "1",
  number =       "3",
  pages =        "17:1--17:??",
  month =        sep,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1391732.1391737",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 4 17:12:44 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "To cope with increasing demands for higher
                 computational power and greater system flexibility,
                 dynamically and partially reconfigurable logic has
                 started to play an important role in embedded systems
                 and systems-on-chip (SoC). However, when using
                 traditional design methods and tools, it is difficult
                 to estimate or analyze the performance impact of
                 including such reconfigurable logic devices into a
                 system design. In this work, we present a system-level
                 framework, called Perfecto, which is able to perform
                 rapid exploration of different reconfigurable design
                 alternatives and to detect system performance
                 bottlenecks. This framework is based on the popular
                 IEEE standard system-level design language SystemC,
                 which is supported by most EDA and ESL tools. Given an
                 architecture model and an application model, Perfecto
                 uses SystemC {\em transaction-level models\/} (TLMs) to
                 simulate the system design alternatives automatically.
                 Different hardware-software copartitioning,
                 coscheduling, and placement algorithms can be embedded
                 into the framework for analysis; thus, Perfecto can
                 also be used to design the algorithms to be used in an
                 operating system for reconfigurable systems.
                 Applications to a simple illustration example and a
                 network security system have shown how Perfecto helps a
                 designer make intelligent partition decisions, optimize
                 system performance, and evaluate task placements.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "17",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "design-space exploration; partitioning; performance
                 evaluation; placement; reconfigurable systems;
                 scheduling",
}

@Article{Chin:2009:SDM,
  author =       "Scott Y. L. Chin and Steven J. E. Wilton",
  title =        "Static and Dynamic Memory Footprint Reduction for
                 {FPGA} Routing Algorithms",
  journal =      j-TRETS,
  volume =       "1",
  number =       "4",
  pages =        "18:1--18:??",
  month =        jan,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1462586.1462587",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jun 1 18:15:01 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "This article presents techniques to reduce the static
                 and dynamic memory requirements of routing algorithms
                 that target field-programmable gate arrays. During
                 routing, memory is required to store both architectural
                 data and temporary routing data. The architectural data
                 is static, and provides a representation of the
                 physical routing resources and programmable connections
                 on the device. We show that by taking advantage of the
                 regularity in FPGAs, we can reduce the amount of
                 information that must be explicitly represented,
                 leading to significant memory savings. The temporary
                 routing data is dynamic, and contains scoring
                 parameters and traceback information for each routing
                 resource in the FPGA. By studying the lifespan of the
                 temporary routing data objects, we develop several
                 memory management schemes to reduce this component. To
                 make our proposals concrete, we applied them to the
                 routing algorithm in VPR and empirically quantified the
                 impact on runtime memory footprint, and place and route
                 time.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "18",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "CAD; FPGA; memory; routing; scalability",
}

@Article{Xu:2009:FAR,
  author =       "Ning-Yi Xu and Xiong-Fei Cai and Rui Gao and Lei Zhang
                 and Feng-Hsiung Hsu",
  title =        "{FPGA} Acceleration of {RankBoost} in {Web} Search
                 Engines",
  journal =      j-TRETS,
  volume =       "1",
  number =       "4",
  pages =        "19:1--19:??",
  month =        jan,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1462586.1462588",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jun 1 18:15:01 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Search relevance is a key measurement for the
                 usefulness of search engines. Shift of search relevance
                 among search engines can easily change a search
                 company's market cap by tens of billions of dollars.
                 With the ever-increasing scale of the Web, machine
                 learning technologies have become important tools to
                 improve search relevance ranking. RankBoost is a
                 promising algorithm in this area, but it is not widely
                 used due to its long training time. To reduce the
                 computation time for RankBoost, we designed an
                 FPGA-based accelerator system and its upgraded version.
                 The accelerator, plugged into a commodity PC, increased
                 the training speed on MSN search engine data up to
                 1800x compared to the original software implementation
                 on a server. The proposed accelerator has been
                 successfully used by researchers in the search
                 relevance ranking.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "FPGA; hardware acceleration",
}

@Article{Patterson:2009:STP,
  author =       "C. D. Patterson and S. W. Ellingson and B. S. Martin
                 and K. Deshpande and J. H. Simonetti and M. Kavic and
                 S. E. Cutchin",
  title =        "Searching for Transient Pulses with the {ETA} Radio
                 Telescope",
  journal =      j-TRETS,
  volume =       "1",
  number =       "4",
  pages =        "20:1--20:??",
  month =        jan,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1462586.1462589",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jun 1 18:15:01 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Array-based, direct-sampling radio telescopes have
                 computational and communication requirements unsuited
                 to conventional computer and cluster architectures.
                 Synchronization must be strictly maintained across a
                 large number of parallel data streams, from A/D
                 conversion, through operations such as beamforming, to
                 dataset recording. FPGAs supporting multigigabit serial
                 I/O are ideally suited to this application. We describe
                 a recently-constructed radio telescope called ETA
                 having all-sky observing capability for detecting low
                 frequency pulses from transient events such as gamma
                 ray bursts and primordial black hole explosions.
                 Signals from 24 dipole antennas are processed by a
                 tiered arrangement of 28 commercial FPGA boards and 4
                 PCs with FPGA-based data acquisition cards, connected
                 with custom I/O adapter boards supporting InfiniBand
                 and LVDS physical links. ETA is designed for unattended
                 operation, allowing configuration and recording to be
                 controlled remotely.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "20",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "Direct sampling radio telescope array; FPGA cluster
                 computing; RFI mitigation; signal dedispersion",
}

@Article{El-Araby:2009:EPR,
  author =       "Esam El-Araby and Ivan Gonzalez and Tarek El-Ghazawi",
  title =        "Exploiting Partial Runtime Reconfiguration for
                 High-Performance Reconfigurable Computing",
  journal =      j-TRETS,
  volume =       "1",
  number =       "4",
  pages =        "21:1--21:??",
  month =        jan,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1462586.1462590",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jun 1 18:15:01 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Runtime Reconfiguration (RTR) has been traditionally
                 utilized as a means for exploiting the flexibility of
                 High-Performance Reconfigurable Computers (HPRCs).
                 However, the RTR feature comes with the cost of high
                 configuration overhead which might negatively impact
                 the overall performance. Currently, modern FPGAs have
                 more advanced mechanisms for reducing the configuration
                 overheads, particularly Partial Runtime Reconfiguration
                 (PRTR). It has been perceived that PRTR on HPRC systems
                 can be the trend for improving the performance. In this
                 work, we will investigate the potential of PRTR on HPRC
                 by formally analyzing the execution model and
                 experimentally verifying our analytical findings by
                 enabling PRTR for the first time, to the best of our
                 knowledge, on one of the current HPRC systems, Cray
                 XD1. Our approach is general and can be applied to any
                 of the available HPRC systems. The paper will conclude
                 with recommendations and conditions, based on our
                 conceptual and experimental work, for the optimal
                 utilization of PRTR as well as possible future usage in
                 HPRC.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "21",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "dynamic partial reconfiguration; field programmable
                 gate arrays (FPGA); High performance computing;
                 reconfigurable computing",
}

@Article{Holland:2009:RRA,
  author =       "Brian Holland and Karthik Nagarajan and Alan D.
                 George",
  title =        "{RAT}: {RC} Amenability Test for Rapid Performance
                 Prediction",
  journal =      j-TRETS,
  volume =       "1",
  number =       "4",
  pages =        "22:1--22:??",
  month =        jan,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1462586.1462591",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jun 1 18:15:01 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "While the promise of achieving speedup and additional
                 benefits such as high performance per watt with FPGAs
                 continues to expand, chief among the challenges with
                 the emerging paradigm of reconfigurable computing is
                 the complexity in application design and
                 implementation. Before a lengthy development effort is
                 undertaken to map a given application to hardware, it
                 is important that a high-level parallel algorithm
                 crafted for that application first be analyzed relative
                 to the target platform, so as to ascertain the
                 likelihood of success in terms of potential speedup.
                 This article presents the RC Amenability Test, or RAT,
                 a methodology and model developed for this purpose,
                 supporting rapid exploration and prediction of
                 strategic design tradeoffs during the formulation stage
                 of application development.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "22",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "formulation methodology; FPGA; performance prediction;
                 reconfigurable computing; strategic design
                 methodology",
}

@Article{Murtaza:2009:CBB,
  author =       "S. Murtaza and A. G. Hoekstra and P. M. A. Sloot",
  title =        "Compute Bound and {I/O} Bound Cellular Automata
                 Simulations on {FPGA} Logic",
  journal =      j-TRETS,
  volume =       "1",
  number =       "4",
  pages =        "23:1--23:??",
  month =        jan,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1462586.1462592",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jun 1 18:15:01 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "FPGA-based computation engines have been used as
                 Cellular Automata accelerators in the scientific
                 community for some time now. With the recent
                 availability of more advanced FPGA logic it becomes
                 necessary to better understand the mapping of Cellular
                 Automata to these systems. There are many trade-offs to
                 consider when mapping a Cellular Automata algorithm
                 from an abstract system to the physical implementation
                 using FPGA logic. The trade-offs include both the
                 available FPGA resources and the Cellular Automata
                 algorithm's execution time. The most important aspect
                 is to fully understand the behavior of the specified CA
                 algorithm in terms of its execution times which are
                 either compute bound or I/O bound. In this article, we
                 present a methodology to categorize a specified CA
                 algorithm as a compute bound or an I/O bound. We take
                 the methodology further by presenting rigorous analysis
                 for each of the two cases identifying the various
                 parameters that control the mapping process and are
                 defined both by the Cellular Automata algorithm and the
                 given FPGA hardware specifications. This methodology
                 helps to predict the performance of running Cellular
                 Automata algorithms on specific FPGA hardware and to
                 determine optimal values for the various parameters
                 that control the mapping process. The model is
                 validated for both compute and I/O bound
                 two-dimensional Cellular Automata algorithms. We find
                 that our model predictions are accurate within 7\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "23",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "cellular automata; FPGA-based hardware accelerator;
                 High-performance computing; lattice Boltzmann
                 simulations",
}

@article{Bouganis:2009:SOF,
  author          = {Christos-S. Bouganis and Sung-Boem Park and George A.
                    Constantinides and Peter Y. K. Cheung},
  title           = {Synthesis and Optimization of {$2$D} Filter Designs
                    for Heterogeneous {FPGAs}},
  journal         = j-TRETS,
  volume          = {1},
  number          = {4},
  pages           = {24:1--24:??},
  month           = jan,
  year            = {2009},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1462586.1462593},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Mon Jun 1 18:15:01 MDT 2009},
  bibsource       = {http://portal.acm.org/;
                    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Many image processing applications require fast
                    convolution of an image with one or more 2D filters.
                    Field-Programmable Gate Arrays (FPGAs) are often used
                    to achieve this goal due to their fine grain
                    parallelism and reconfigurability. However, the
                    heterogeneous nature of modern reconfigurable devices
                    is not usually considered during design optimization.
                    This article proposes an algorithm that explores the
                    space of possible implementation architectures of 2D
                    filters, targeting the minimization of the required
                    area, by optimizing the usage of the different
                    components in a heterogeneous device. This is achieved
                    by exploring the heterogeneous nature of modern
                    reconfigurable devices using a Singular Value
                    Decomposition based algorithm, which provides an
                    efficient mapping of filter's implementation
                    requirements to the heterogeneous components of modern
                    FPGAs. In the case of multiple 2D filters, the proposed
                    algorithm also exploits any redundancy that exists
                    within each filter and between different filters in the
                    set, leading to designs with minimized area.
                    Experiments with real filter sets from computer vision
                    applications demonstrate an average of up to 38\%
                    reduction in the required area.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {24},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                    Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
  keywords        = {2D filter design; FPGA; reconfigurable logic; Singular
                    Value Decomposition},
}

@article{Schaumont:2009:GEI,
  author          = {Patrick R. Schaumont and Alex K. Jones and Steve
                    Trimberger},
  title           = {{Guest Editors}' Introduction to Security in
                    Reconfigurable Systems Design},
  journal         = j-TRETS,
  volume          = {2},
  number          = {1},
  pages           = {1:1--1:??},
  month           = mar,
  year            = {2009},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1502781.1502782},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Mon Jun 1 18:15:27 MDT 2009},
  bibsource       = {http://portal.acm.org/;
                    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {This special issue on Security in Reconfigurable
                    Systems Design reports on recent research results in
                    the design and implementation of trustworthy
                    reconfigurable systems. Five articles cover topics
                    including power-efficient implementation of public-key
                    cryptography, side-channel analysis of electromagnetic
                    radiation, side-channel resistant design, design of
                    robust unclonable functions on an FPGA, and Trojan
                    detection in an FPGA bitstream.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {1},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                    Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
  keywords        = {physically unclonable function; side-channel resistant
                    design; Trojan; Trustworthy design},
}

@Article{Keller:2009:ECC,
  author =       "Maurice Keller and Andrew Byrne and William P.
                 Marnane",
  title =        "Elliptic Curve Cryptography on {FPGA} for Low-Power
                 Applications",
  journal =      j-TRETS,
  volume =       "2",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1502781.1502783",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jun 1 18:15:27 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Elliptic curve cryptography has generated a lot of
                 research interest due to its ability to provide greater
                 security per bit compared to public key systems such as
                 RSA. The designer of an elliptic curve hardware
                 accelerator is faced with many choices at design time,
                 each of which can impact the performance of the
                 accelerator in different ways. There are many examples
                 in the literature of how these design choices can
                 affect the area and/or speed of an elliptic curve
                 hardware accelerator. The effect of design choices on
                 power and energy consumption in elliptic curve hardware
                 has been less well studied. This article studies the
                 effect of design choices on the power and energy
                 consumption of an FPGA-based reconfigurable elliptic
                 curve hardware accelerator. A reconfigurable processor
                 has been used for different system parameters and the
                 power and energy consumption measured. The power and
                 energy results are presented and compared.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "Cryptography; elliptic curves; FPGA; low-power",
}

@article{McEvoy:2009:IWH,
  author          = {Robert P. McEvoy and Colin C. Murphy and William P.
                    Marnane and Michael Tunstall},
  title           = {Isolated {WDDL}: a Hiding Countermeasure for
                    Differential Power Analysis on {FPGAs}},
  journal         = j-TRETS,
  volume          = {2},
  number          = {1},
  pages           = {3:1--3:??},
  month           = mar,
  year            = {2009},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1502781.1502784},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Mon Jun 1 18:15:27 MDT 2009},
  bibsource       = {http://portal.acm.org/;
                    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Security protocols are frequently accelerated by
                    implementing the underlying cryptographic functions in
                    reconfigurable hardware. However, unprotected hardware
                    implementations are susceptible to side-channel
                    attacks, and Differential Power Analysis (DPA) has been
                    shown to be especially powerful. In this work, we
                    evaluate and compare the effectiveness of common hiding
                    countermeasures against DPA in FPGA-based designs,
                    using the Whirlpool hash function as a case study. In
                    particular, we develop a new design flow called
                    Isolated WDDL (IWDDL). In contrast with previous works,
                    IWDDL isolates the direct and complementary circuit
                    paths, and also provides DPA resistance in the Hamming
                    distance power model. The analysis is supported using
                    actual implementation results.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {3},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                    Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
  keywords        = {DPA; FPGA; secure logic; Side-channel attacks;
                    Whirlpool},
}

@article{Sauvage:2009:ERF,
  author          = {Laurent Sauvage and Sylvain Guilley and Yves Mathieu},
  title           = {Electromagnetic Radiations of {FPGAs}: High Spatial
                    Resolution Cartography and Attack on a Cryptographic
                    Module},
  journal         = j-TRETS,
  volume          = {2},
  number          = {1},
  pages           = {4:1--4:??},
  month           = mar,
  year            = {2009},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1502781.1502785},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Mon Jun 1 18:15:27 MDT 2009},
  bibsource       = {http://portal.acm.org/;
                    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Since the first announcement of a Side Channel
                    Analysis (SCA) about ten years ago, considerable
                    research has been devoted to studying these attacks on
                    Application Specific Integrated Circuits (ASICs), such
                    as smart cards or TPMs. In this article, we compare
                    power-line attacks with ElectroMagnetic (EM) attacks,
                    specifically targeting Field Programmable Gate Array
                    devices (FPGAs), as they are becoming widely used for
                    sensitive applications involving cryptography.\par

                    We show experimentally that ElectroMagnetic Analysis
                    (EMA) is always faster than the historical Differential
                    Power Analysis (DPA) in retrieving keys of symmetric
                    ciphers. In addition, these analyses prove to be very
                    convenient to conduct, as they are totally
                    non-invasive.\par

                    Research reports indicate that EMA can be conducted
                    globally, typically with macroscopic home-made coils
                    circling the device under attack, with fair results.
                    However, as accurate professional EM antennas are now
                    becoming more accessible, it has become commonplace to
                    carry out EM analyses locally.\par

                    Cartography has been carried out by optical means on
                    circuits realized with technology greater than 250
                    nanometers. Nonetheless, for deep submicron
                    technologies, the feature size of devices that are
                    spied upon is too small to be visible with photographic
                    techniques. In addition, the presence of the 6+
                    metallization layers obviously prevents a direct
                    observation of the layout. Therefore, EM imaging is
                    emerging as a relevant means to discover the underlying
                    device structure.\par

                    In this article, we present the first images of
                    deep-submicron FPGAs. The resolution is not as accurate
                    as photographic pictures: we notably compare the layout
                    of toy design examples placed at the four corners of
                    the FPGAs with the EM images we collected. We observe
                    that EM imaging has the advantage of revealing active
                    regions, which can be useful in locating a particular
                    processor (visible while active---invisible when
                    inactive).\par

                    In the context of EM attacks, we stress that the exact
                    localization of the cryptographic target is not
                    necessary: the coarse resolution we obtain is
                    sufficient. We note that the EM imaging does not reveal
                    the exact layout of the FPGA, but instead directly
                    guides the attacker towards the areas which are leaking
                    the most. We achieve attacks with an accurate sensor,
                    both far from (namely on a SMC capacitor on the board)
                    and close to (namely directly over the FPGA) the
                    encryption co-processor. As compared to the previously
                    published attacks, we report a successful attack on a
                    DES module in fewer than 6,300 measurements, which is
                    currently the best cracking performance against this
                    encryption algorithm implemented in FPGAs.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {4},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                    Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
  keywords        = {cartography; DPA; EMA; FPGA; SCA; security},
}

@article{Majzoobi:2009:TDI,
  author          = {Mehrdad Majzoobi and Farinaz Koushanfar and Miodrag
                    Potkonjak},
  title           = {Techniques for Design and Implementation of Secure
                    Reconfigurable {PUFs}},
  journal         = j-TRETS,
  volume          = {2},
  number          = {1},
  pages           = {5:1--5:??},
  month           = mar,
  year            = {2009},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1502781.1502786},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Mon Jun 1 18:15:27 MDT 2009},
  bibsource       = {http://portal.acm.org/;
                    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Physically unclonable functions (PUFs) provide a basis
                    for many security and digital rights management
                    protocols. PUF-based security approaches have numerous
                    comparative strengths with respect to traditional
                    cryptography-based techniques, including resilience
                    against physical and side channel attacks and
                    suitability for lightweight protocols. However,
                    classical delay-based PUF structures have a number of
                    drawbacks including susceptibility to guessing, reverse
                    engineering, and emulation attacks, as well as
                    sensitivity to operational and environmental
                    variations.\par

                    To address these limitations, we have developed a new
                    set of techniques for FPGA-based PUF design and
                    implementation. We demonstrate how reconfigurability
                    can be exploited to eliminate the stated PUF
                    limitations. We also show how FPGA-based PUFs can be
                    used for privacy protection. Furthermore,
                    reconfigurability enables the introduction of new
                    techniques for PUF testing. The effectiveness of all
                    the proposed techniques is validated using extensive
                    implementations, simulations, and statistical
                    analysis.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {5},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                    Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
  keywords        = {hardware security; physically unclonable functions;
                    process variation; Reconfigurable systems},
}

@Article{Dutt:2009:TBD,
  author =       "Shantanu Dutt and Li Li",
  title =        "Trust-Based Design and Check of {FPGA} Circuits Using
                 Two-Level Randomized {ECC} Structures",
  journal =      j-TRETS,
  volume =       "2",
  number =       "1",
  pages =        "6:1--6:??",
  month =        mar,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1502781.1508209",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jun 1 18:15:27 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "A novel trust-based design method for FPGA circuits
                 that uses error-correcting code (ECC) structures for
                 detecting design tampers (changes, deletion of existing
                 logic, and addition of extradesign logic-like Trojans)
                 is proposed in this article. We determine ECC-based CLB
                 (configuration logic block) parity groups and embed the
                 check CLBs for each parity group in the FPGA circuit.
                 During a trust-checking phase, a Test-Pattern Generator
                 (TPG) and an Output Response Analyzer (ORA), configured
                 in the FPGA, are used to check that each parity group
                 of CLB outputs produce the expected parities. We use
                 two levels of randomization to thwart attempts by an
                 adversary to discover the parity groups and inject
                 tampers that mask each other, or to tamper with the TPG
                 and ORA so that design tampers remain undetected: (a)
                 randomization of the mapping of the ECC parity groups
                 to the CLB array; (b) randomization within each parity
                 group of odd and even parities for different input
                 combinations (classically, all ECC parity groups have
                 even parities across all inputs). These randomizations
                 along with the error-detecting property of the
                 underlying ECC lead to design tampers being uncovered
                 with very high probabilities, as we show both
                 analytically and empirically. We also classify
                 different CLB function structures and impose a parity
                 group selection in which only similarly structured
                 functions are randomly selected to be in the same
                 parity group in order to minimize check function
                 complexity. Using the 2D code as our underlying ECC and
                 its 2-level randomization, our experiments with
                 inserting 1--10 circuit CLB tampers and 1--5 extraneous
                 logic CLBs in two medium-size circuits and a RISC
                 processor circuit implemented on a Xilinx Spartan-3
                 FPGA show promising results of 100\% tamper detection
                 and 0\% false alarms, obtained at a hardware overhead
                 of only 7--10\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "Error-correcting codes; FPGAs; masking probability;
                 parity groups; parity randomization; trust checking;
                 trust-based design",
}

@Article{Amano:2009:GEI,
  author =       "Hideharu Amano and Tadao Nakamura",
  title =        "Guest Editors' Introduction: {ICFPT 2007}",
  journal =      j-TRETS,
  volume =       "2",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1534916.1534917",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Mar 16 09:46:50 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@article{Zhao:2009:TMB,
  author          = {Weisheng Zhao and Eric Belhaire and Claude Chappert
                    and Bernard Dieny and Guillaume Prenat},
  title           = {{TAS-MRAM}-Based Low-Power High-Speed Runtime
                    Reconfiguration {(RTR) FPGA}},
  journal         = j-TRETS,
  volume          = {2},
  number          = {2},
  pages           = {8:1--8:??},
  month           = jun,
  year            = {2009},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1534916.1534918},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Mar 16 09:46:50 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {8},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                    Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Koch:2009:HDT,
  author          = {Dirk Koch and Christian Beckhoff and J{\"u}rgen
                    Teich},
  title           = {Hardware Decompression Techniques for {FPGA}-Based
                    Embedded Systems},
  journal         = j-TRETS,
  volume          = {2},
  number          = {2},
  pages           = {9:1--9:??},
  month           = jun,
  year            = {2009},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1534916.1534919},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Mar 16 09:46:50 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {9},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                    Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Wong:2009:SMC,
  author          = {Justin S. J. Wong and Pete Sedcole and Peter Y. K.
                    Cheung},
  title           = {Self-Measurement of Combinatorial Circuit Delays in
                    {FPGAs}},
  journal         = j-TRETS,
  volume          = {2},
  number          = {2},
  pages           = {10:1--10:??},
  month           = jun,
  year            = {2009},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1534916.1534920},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Mar 16 09:46:50 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {10},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                    Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Seetharaman:2009:ASF,
  author          = {G. Seetharaman and B. Venkataramani},
  title           = {Automation Schemes for {FPGA} Implementation of
                    Wave-Pipelined Circuits},
  journal         = j-TRETS,
  volume          = {2},
  number          = {2},
  pages           = {11:1--11:??},
  month           = jun,
  year            = {2009},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1534916.1534921},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Mar 16 09:46:50 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {11},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                    Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Yu:2009:VPS,
  author          = {Jason Yu and Christopher Eagleston and Christopher
                    Han-Yu Chou and Maxime Perreault and Guy Lemieux},
  title           = {Vector Processing as a Soft Processor Accelerator},
  journal         = j-TRETS,
  volume          = {2},
  number          = {2},
  pages           = {12:1--12:??},
  month           = jun,
  year            = {2009},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1534916.1534922},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Mar 16 09:46:50 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {12},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                    Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Cevrero:2009:FPC,
  author          = {Alessandro Cevrero and Panagiotis Athanasopoulos and
                    Hadi Parandeh-Afshar and Ajay K. Verma and Hosein Seyed
                    Attarzadeh Niaki and Chrysostomos Nicopoulos and Frank
                    K. Gurkaynak and Philip Brisk and Yusuf Leblebici and
                    Paolo Ienne},
  title           = {Field Programmable Compressor Trees: Acceleration of
                    Multi-Input Addition on {FPGAs}},
  journal         = j-TRETS,
  volume          = {2},
  number          = {2},
  pages           = {13:1--13:??},
  month           = jun,
  year            = {2009},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1534916.1534923},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Mar 16 09:46:50 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {13},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                    Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Jang:2009:WFT,
  author          = {Stephen Jang and Billy Chan and Kevin Chung and Alan
                    Mishchenko},
  title           = {{WireMap}: {FPGA} Technology Mapping for Improved
                    Routability and Enhanced {LUT} Merging},
  journal         = j-TRETS,
  volume          = {2},
  number          = {2},
  pages           = {14:1--14:??},
  month           = jun,
  year            = {2009},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1534916.1534924},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Mar 16 09:46:50 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {14},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                    Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Chung:2009:PTS,
  author          = {Eric S. Chung and Michael K. Papamichael and Eriko
                    Nurvitadhi and James C. Hoe and Ken Mai and Babak
                    Falsafi},
  title           = {{ProtoFlex}: Towards Scalable, Full-System
                    Multiprocessor Simulations Using {FPGAs}},
  journal         = j-TRETS,
  volume          = {2},
  number          = {2},
  pages           = {15:1--15:??},
  month           = jun,
  year            = {2009},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1534916.1534925},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Mar 16 09:46:50 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {15},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                    Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Pellauer:2009:PNP,
  author          = {Michael Pellauer and Muralidaran Vijayaraghavan and
                    Michael Adler and Arvind and Joel Emer},
  title           = {{A}-Port Networks: Preserving the Timed Behavior of
                    Synchronous Systems for Modeling on {FPGAs}},
  journal         = j-TRETS,
  volume          = {2},
  number          = {3},
  pages           = {16:1--16:??},
  month           = sep,
  year            = {2009},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1575774.1575775},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Mar 16 09:46:54 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {16},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                    Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Cong:2009:FBH,
  author          = {Jason Cong and Yi Zou},
  title           = {{FPGA}-Based Hardware Acceleration of Lithographic
                    Aerial Image Simulation},
  journal         = j-TRETS,
  volume          = {2},
  number          = {3},
  pages           = {17:1--17:??},
  month           = sep,
  year            = {2009},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1575774.1575776},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Mar 16 09:46:54 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {17},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                    Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Ahmed:2009:PTV,
  author          = {Taneem Ahmed and Paul D. Kundarewich and Jason H.
                    Anderson},
  title           = {Packing Techniques for {Virtex-5 FPGAs}},
  journal         = j-TRETS,
  volume          = {2},
  number          = {3},
  pages           = {18:1--18:??},
  month           = sep,
  year            = {2009},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1575774.1575777},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Mar 16 09:46:54 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {18},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                    Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Parandeh-Afshar:2009:FLC,
  author          = {Hadi Parandeh-Afshar and Philip Brisk and Paolo
                     Ienne},
  title           = {An {FPGA} Logic Cell and Carry Chain Configurable as a
                     6:2 or 7:2 Compressor},
  journal         = j-TRETS,
  volume          = {2},
  number          = {3},
  pages           = {19:1--19:??},
  month           = sep,
  year            = {2009},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1575774.1575778},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Mar 16 09:46:54 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {19},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Compton:2009:ISI,
  author          = {Katherine Compton and Roger Woods and Christos
                     Bouganis and Pedro Diniz},
  title           = {Introduction to the Special Issue {ARC'08}},
  journal         = j-TRETS,
  volume          = {2},
  number          = {4},
  pages           = {20:1--20:??},
  month           = sep,
  year            = {2009},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1575779.1575780},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Mar 16 09:46:56 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {20},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Jin:2009:ERA,
  author          = {Qiwei Jin and David B. Thomas and Wayne Luk and
                     Benjamin Cope},
  title           = {Exploring Reconfigurable Architectures for Tree-Based
                     Option Pricing Models},
  journal         = j-TRETS,
  volume          = {2},
  number          = {4},
  pages           = {21:1--21:??},
  month           = sep,
  year            = {2009},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1575779.1575781},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Mar 16 09:46:56 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {21},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Angelopoulou:2009:RRT,
  author          = {Maria E. Angelopoulou and Christos-Savvas Bouganis and
                     Peter Y. K. Cheung and George A. Constantinides},
  title           = {Robust Real-Time Super-Resolution on {FPGA} and an
                     Application to Video Enhancement},
  journal         = j-TRETS,
  volume          = {2},
  number          = {4},
  pages           = {22:1--22:??},
  month           = sep,
  year            = {2009},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1575779.1575782},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Mar 16 09:46:56 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {22},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Lo:2009:SOC,
  author          = {Chia-Tien Dan Lo and Yi-Gang Tai},
  title           = {Space Optimization on Counters for {FPGA}-Based {Perl}
                     Compatible Regular Expressions},
  journal         = j-TRETS,
  volume          = {2},
  number          = {4},
  pages           = {23:1--23:??},
  month           = sep,
  year            = {2009},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1575779.1575783},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Mar 16 09:46:56 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {23},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Vassiliadis:2009:ADF,
  author          = {Nikolaos Vassiliadis and George Theodoridis and
                     Spiridon Nikolaidis},
  title           = {An Application Development Framework for {ARISE}
                     Reconfigurable Processors},
  journal         = j-TRETS,
  volume          = {2},
  number          = {4},
  pages           = {24:1--24:??},
  month           = sep,
  year            = {2009},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1575779.1575784},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Mar 16 09:46:56 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {24},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Dragomir:2009:OLU,
  author          = {Ozana Silvia Dragomir and Todor Stefanov and Koen
                     Bertels},
  title           = {Optimal Loop Unrolling and Shifting for Reconfigurable
                     Architectures},
  journal         = j-TRETS,
  volume          = {2},
  number          = {4},
  pages           = {25:1--25:??},
  month           = sep,
  year            = {2009},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1575779.1575785},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Mar 16 09:46:56 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {25},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Underwood:2009:SSL,
  author          = {Keith D. Underwood and K. Scott Hemmert and Craig D.
                     Ulmer},
  title           = {From Silicon to Science: The Long Road to Production
                     Reconfigurable Supercomputing},
  journal         = j-TRETS,
  volume          = {2},
  number          = {4},
  pages           = {26:1--26:??},
  month           = sep,
  year            = {2009},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1575779.1575786},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Mar 16 09:46:56 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {26},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Roldao:2010:HTF,
  author          = {Antonio Roldao and George A. Constantinides},
  title           = {A High Throughput {FPGA}-Based Floating Point
                     Conjugate Gradient Implementation for Dense Matrices},
  journal         = j-TRETS,
  volume          = {3},
  number          = {1},
  pages           = {1:1--1:??},
  month           = jan,
  year            = {2010},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1661438.1661439},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Mar 16 09:47:03 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {1},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Dubois:2010:SMV,
  author          = {David Dubois and Andrew Dubois and Thomas Boorman and
                     Carolyn Connor and Steve Poole},
  title           = {Sparse Matrix-Vector Multiplication on a
                     Reconfigurable Supercomputer with Application},
  journal         = j-TRETS,
  volume          = {3},
  number          = {1},
  pages           = {2:1--2:??},
  month           = jan,
  year            = {2010},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1661438.1661440},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Mar 16 09:47:03 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {2},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Drimer:2010:DBP,
  author          = {Saar Drimer and Tim G{\"u}neysu and Christof Paar},
  title           = {{DSPs}, {BRAMs}, and a Pinch of Logic: Extended
                     Recipes for {AES} on {FPGAs}},
  journal         = j-TRETS,
  volume          = {3},
  number          = {1},
  pages           = {3:1--3:??},
  month           = jan,
  year            = {2010},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1661438.1661441},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Mar 16 09:47:03 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {3},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Koh:2010:CMP,
  author          = {Shannon Koh and Oliver Diessel},
  title           = {Configuration Merging in Point-to-Point Networks for
                     Module-Based {FPGA} Reconfiguration},
  journal         = j-TRETS,
  volume          = {3},
  number          = {1},
  pages           = {4:1--4:??},
  month           = jan,
  year            = {2010},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1661438.1661442},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Mar 16 09:47:03 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {4},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Curreri:2010:PAF,
  author          = {John Curreri and Seth Koehler and Alan D. George and
                     Brian Holland and Rafael Garcia},
  title           = {Performance Analysis Framework for High-Level Language
                     Applications in Reconfigurable Computing},
  journal         = j-TRETS,
  volume          = {3},
  number          = {1},
  pages           = {5:1--5:??},
  month           = jan,
  year            = {2010},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1661438.1661443},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Mar 16 09:47:03 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {5},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Bodily:2010:CSI,
  author          = {John Bodily and Brent Nelson and Zhaoyi Wei and
                     Dah-Jye Lee and Jeff Chase},
  title           = {A Comparison Study on Implementing Optical Flow and
                     Digital Communications on {FPGAs} and {GPUs}},
  journal         = j-TRETS,
  volume          = {3},
  number          = {2},
  pages           = {6:1--6:??},
  month           = may,
  year            = {2010},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1754386.1754387},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Jun 22 16:00:33 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {FPGA devices have often found use as
                     higher-performance alternatives to programmable
                     processors for implementing computations. Applications
                     successfully implemented on FPGAs typically contain
                     high levels of parallelism and often use simple
                     statically scheduled control and modest arithmetic.
                     Recently introduced computing devices such as
                     coarse-grain reconfigurable arrays, multi-core
                     processors, and graphical processing units promise to
                     significantly change the computational landscape and
                     take advantage of many of the same application
                     characteristics that fit well on FPGAs. One real-time
                     computing task, optical flow, is difficult to apply in
                     robotic vision applications because of its high
                     computational and data rate requirements, and so is a
                     good candidate for implementation on FPGAs and other
                     custom computing architectures. This article reports on
                     a series of experiments mapping a collection of
                     different algorithms onto both an FPGA and a GPU. For
                     two different optical flow algorithms the GPU had
                     better performance, while for a set of digital comm
                     MIMO computations, they had similar performance. In all
                     cases the FPGA implementations required 10x the
                     development time. Finally, a discussion of the two
                     technology's characteristics is given to show they
                     achieve high performance in different ways.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {6},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
  keywords        = {Digital communications; FPGA; GPU; optical flow;
                     reconfigurable computing},
}

@article{Papadopoulos:2010:TRM,
  author          = {Konstantinos Papadopoulos and Ioannis Papaefstathiou},
  title           = {{Titan-R}: a Multigigabit Reconfigurable Combined
                     Compression\slash Decompression Unit},
  journal         = j-TRETS,
  volume          = {3},
  number          = {2},
  pages           = {7:1--7:??},
  month           = may,
  year            = {2010},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1754386.1754388},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Jun 22 16:00:33 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Data compression techniques can alleviate bandwidth
                     problems in even multigigabit networks and are
                     especially useful when combined with encryption. This
                     article demonstrates a reconfigurable hardware
                     compressor/decompressor core, the Titan-R, which can
                     compress/decompress data streams at 8.5 Gb/sec, making
                     it the fastest reconfigurable such device ever
                     proposed; the presented full-duplex implementation
                     allows for fully symmetric compression and
                     decompression rates at 8.5 Gbps each. Its compression
                     algorithm is a variation of the most widely used and
                     efficient such scheme, the Lempel--Ziv (LZ) algorithm
                     that uses part of the previous input stream as the
                     dictionary. In order to support this high network
                     throughput, the Titan-R utilizes a very fine-grained
                     pipeline and takes advantage of the high bandwidth
                     provided by the distributed on-chip RAMs of
                     state-of-the-art FPGAs.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {7},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
  keywords        = {data compression; FPGA; hardware algorithms;
                     networking; parallel processing; reconfigurable
                     computing; Stream processing},
}

@Article{Badrignans:2010:SSA,
  author =       "Beno{\^\i}t Badrignans and David Champagne and Reouven
                 Elbaz and Catherine Gebotys and Lionel Torres",
  title =        "{SARFUM}: Security Architecture for Remote {FPGA}
                 Update and Monitoring",
  journal =      j-TRETS,
  volume =       "3",
  number =       "2",
  pages =        "8:1--8:??",
  month =        may,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1754386.1754389",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Jun 22 16:00:33 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Remote update of hardware platforms or embedded
                 systems is a convenient service enabled by Field
                 Programmable Gate Array (FPGA)-based systems. This
                 service is often essential in applications like
                 space-based FPGA systems or set-top boxes. However,
                 having the source of the update be remote from the FPGA
                 system opens the door to a set of attacks that may
                 challenge the confidentiality and integrity of the FPGA
                 configuration, the bitstream. Existing schemes propose
                 to encrypt and authenticate the bitstream to thwart
                 these attacks. However, we show that they do not
                 prevent the replay of old bitstream versions, and thus
                 give adversaries an opportunity for downgrading the
                 system. In this article, we propose a new architecture
                 called SARFUM that, in addition to ensuring bitstream
                 confidentiality and integrity, precludes the replay of
                 old bitstreams. SARFUM also includes a protocol for the
                 system designer to remotely monitor the running
                 configuration of the FPGA. Following our presentation
                 and analysis of the security protocols, we propose an
                 example of implementation with the CCM (Counter with
                 CBC-MAC) authenticated encryption standard. We also
                 evaluate the impact of our architecture on the
                 configuration time for different FPGA devices.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "authenticated encryption; bitstream security; FPGA;
                 replay attack; security protocol; system downgrade",
}

@article{Yoo:2010:IRR,
  author          = {Sang-Kyung Yoo and Deniz Karakoyunlu and Berk Birand
                     and Berk Sunar},
  title           = {Improving the Robustness of Ring Oscillator {TRNGs}},
  journal         = j-TRETS,
  volume          = {3},
  number          = {2},
  pages           = {9:1--9:??},
  month           = may,
  year            = {2010},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1754386.1754390},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Jun 22 16:00:33 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {A ring oscillator-based true-random number generator
                     design (Rings design) was introduced in Sunar et al.
                     [2007]. The design was rigorously analyzed under a
                     simple mathematical model and its performance
                     characteristics were established. In this article we
                     focus on the practical aspects of the Rings design on a
                     reconfigurable logic platform and determine their
                     implications on the earlier analysis framework. We make
                     recommendations for avoiding pitfalls in real-life
                     implementations by considering ring interaction,
                     transistor-level effects, narrow signal rejection,
                     transmission line attenuation, and sampler bias.
                     Furthermore, we present experimental results showing
                     that changing operating conditions such as the power
                     supply voltage or the operating temperature may affect
                     the output quality when the signal is subsampled.
                     Hence, an attacker may shift the operating point via a
                     simple noninvasive influence and easily bias the TRNG
                     output. Finally, we propose modifications to the design
                     which significantly improve its robustness against
                     attacks, alleviate implementation-related problems, and
                     simultaneously improve its area, throughput, and power
                     performance.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {9},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
  keywords        = {cryptography; Oscillator rings; true random number
                     generators},
}

@article{Huffmire:2010:SPR,
  author          = {Ted Huffmire and Timothy Levin and Thuy Nguyen and
                     Cynthia Irvine and Brett Brotherton and Gang Wang and
                     Timothy Sherwood and Ryan Kastner},
  title           = {Security Primitives for Reconfigurable Hardware-Based
                     Systems},
  journal         = j-TRETS,
  volume          = {3},
  number          = {2},
  pages           = {10:1--10:??},
  month           = may,
  year            = {2010},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1754386.1754391},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Jun 22 16:00:33 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Computing systems designed using reconfigurable
                     hardware are increasingly composed using a number of
                     different Intellectual Property (IP) cores, which are
                     often provided by third-party vendors that may have
                     different levels of trust. Unlike traditional software
                     where hardware resources are mediated using an
                     operating system, IP cores have fine-grain control over
                     the underlying reconfigurable hardware. To address this
                     problem, the embedded systems community requires novel
                     security primitives that address the realities of
                     modern reconfigurable hardware. In this work, we
                     propose security primitives using ideas centered around
                     the notion of ``moats and drawbridges.'' The primitives
                     encompass four design properties: logical isolation,
                     interconnect traceability, secure reconfigurable
                     broadcast, and configuration scrubbing. Each of these
                     is a fundamental operation with easily understood
                     formal properties, yet they map cleanly and efficiently
                     to a wide variety of reconfigurable devices. We
                     carefully quantify the required overheads of the
                     security techniques on modern FPGA architectures across
                     a number of different applications.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {10},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
  keywords        = {Advanced Encryption Standard (AES); controlled
                     sharing; enforcement mechanisms; execution monitors;
                     Field Programmable Gate Arrays (FPGAs); hardware
                     security; isolation; memory protection; reference
                     monitors; security policies; security primitives;
                     separation; static analysis; Systems-on-a-Chip (SoCs)},
}

@article{Hemmert:2010:FEF,
  author          = {K. Scott Hemmert and Keith D. Underwood},
  title           = {Fast, Efficient Floating-Point Adders and Multipliers
                     for {FPGAs}},
  journal         = j-TRETS,
  volume          = {3},
  number          = {3},
  pages           = {11:1--11:??},
  month           = sep,
  year            = {2010},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1839480.1839481},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Fri Oct 8 18:26:34 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Floating-point applications are a growing trend in the
                     FPGA community. As such, it has become critical to
                     create floating-point units optimized for standard FPGA
                     technology. Unfortunately, the FPGA design space is
                     very different from the VLSI design space; thus,
                     optimizations for FPGAs can differ significantly from
                     optimizations for VLSI. In particular, the FPGA
                     environment constrains the design space such that only
                     limited parallelism can be effectively exploited to
                     reduce latency. Obtaining the right balances between
                     clock speed, latency, and area in FPGAs can be
                     particularly challenging. This article presents
                     implementation details for an IEEE-754 standard
                     floating-point adder and multiplier for FPGAs. The
                     designs presented here enable a Xilinx Virtex4 FPGA
                     (-11 speed grade) to achieve 270 MHz IEEE compliant
                     double precision floating-point performance with a
                     9-stage adder pipeline and 14-stage multiplier
                     pipeline. The area requirement is approximately 500
                     slices for the adder and under 750 slices for the
                     multiplier.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {11},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
  keywords        = {floating point; FPGA; HPC; reconfigurable computing},
}

@article{Sghaier:2010:IAT,
  author          = {Ahmad Sghaier and Shawki Areibi and Robert Dony},
  title           = {Implementation Approaches Trade-Offs for {WiMax OFDM}
                     Functions on Reconfigurable Platforms},
  journal         = j-TRETS,
  volume          = {3},
  number          = {3},
  pages           = {12:1--12:??},
  month           = sep,
  year            = {2010},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1839480.1839482},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Fri Oct 8 18:26:34 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {This work investigates several approaches for
                     implementing the OFDM functions of the fixed-WiMax
                     standard on reconfigurable platforms. In the first
                     phase, a custom RTL approach, using VHDL, is
                     investigated. The approach shows the capability of a
                     medium-size FPGA to accommodate the OFDM functions of a
                     fixed-WiMax transceiver with only 50\% occupation rate.
                     In the second phase, a high-level approach based on the
                     AccelDSP tool is used and compared to the custom RTL
                     approach. The approach presents an easy flow to
                     transfer MATLAB floating-point code into synthesizable
                     cores. The AccelDSP approach shows an area overhead of
                     10\%, while allowing early architectural exploration
                     and accelerating the design time by a factor of two.
                     However, the performance figure obtained is almost 1/4
                     of that obtained in the custom RTL approach. In the
                     third phase, the Tensilica Xtensa configurable
                     processor is targeted, which presents remarkable
                     figures in terms of power, area, and design time.
                     Comparing the three approaches indicates that the
                     custom RTL approach has the lead in terms of
                     performance. However, both the AccelDSP and the
                     Tensilica Xtensa approaches show fast design time and
                     early architectural exploration capability. In terms of
                     power, the obtained estimation results show that the
                     configurable Xtensa processor approach has the lead,
                     where approximately the total power consumed is about
                     12--15 times less than those results obtained by the
                     other two approaches.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {12},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
  keywords        = {AccelDSP; ASIP; custom RTL; FPGA; Tensilica; WiMax},
}

%%% TRETS 3(3), Sep 2010, article 13.  Topic (per abstract): automatic
%%% proposal and evaluation of fused arithmetic units for heterogeneous
%%% FPGAs, based on common subgraph extraction across benchmarks.
@Article{Smith:2010:AFA,
  author =       "Alastair M. Smith and George A. Constantinides and
                 Peter Y. K. Cheung",
  title =        "An Automated Flow for Arithmetic Component Generation
                 in Field-Programmable Gate Arrays",
  journal =      j-TRETS,
  volume =       "3",
  number =       "3",
  pages =        "13:1--13:??",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1839480.1839483",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Oct 8 18:26:34 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "State-of-the-art configurable logic platforms, such as
                 Field-Programmable Gate Arrays (FPGAs), consist of a
                 heterogeneous mixture of different component types.
                 Compared to traditional homogeneous configurable
                 platforms, heterogeneity provides speed and density
                 advantages. This is due to the replacement of
                 inefficient programmable logic and routing with
                 specialized logic and fixed interconnect in components
                 such as memories, embedded processor units, and fused
                 arithmetic units. Given the increasing complexity of
                 these components, this article introduces a method to
                 automatically propose and explore the benefits of
                 different types of fused arithmetic units. The methods
                 are based on common subgraph extraction techniques,
                 meaning that it is possible to explore different
                 subcircuits that occur frequently across a set of
                 benchmarks. A quantitative analysis is performed of the
                 various fused arithmetic circuits identified by our
                 tool, which are then automatically synthesized to an
                 ASIC process, providing a study of the speed and area
                 benefits of the components. The results of this study
                 provide bounds on the performance of heterogeneous
                 FPGAs: by incorporating coarse-grain components which
                 match the specific needs of a set of benchmarks we show
                 that significant improvements in circuit speed and area
                 can be made.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "common subgraph; FPGA; reconfigurable logic",
}

%%% TRETS 3(3), Sep 2010, article 14.  Topic (per abstract): two hardware
%%% architectures that accelerate the search for homologous RNA
%%% molecules, compared against the Infernal software package.
%%% The abstract has three paragraphs, each break written as \par
%%% followed by a blank line inside the quoted value.
@Article{Moscola:2010:HAR,
  author =       "James Moscola and Ron K. Cytron and Young H. Cho",
  title =        "Hardware-Accelerated {RNA} Secondary-Structure
                 Alignment",
  journal =      j-TRETS,
  volume =       "3",
  number =       "3",
  pages =        "14:1--14:??",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1839480.1839484",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Oct 8 18:26:34 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "The search for homologous RNA molecules---sequences of
                 RNA that might behave similarly due to similarity in
                 their physical (secondary) structure---is currently a
                 computationally intensive task. Moreover, RNA sequences
                 are populating genome databases at a pace unmatched by
                 gains in standard processor performance. While software
                 tools such as Infernal can efficiently find homologies
                 among RNA families and genome databases of modest size,
                 the continuous advent of new RNA families and the
                 explosive growth in volume of RNA sequences necessitate
                 a faster approach.\par

                 This work introduces two different architectures for
                 accelerating the task of finding homologous RNA
                 molecules in a genome database. The first architecture
                 takes advantage of the tree-like configuration of the
                 covariance models used to represent the consensus
                 secondary structure of an RNA family and converts it
                 directly into a highly-pipelined processing engine.
                 Results for this architecture show a 24$ \times $
                 speedup over Infernal when processing a small RNA
                 model. It is estimated that the architecture could
                 potentially offer several thousands of times speedup
                 over Infernal on larger models, provided that there are
                 sufficient hardware resources available.\par

                 The second architecture is introduced to address the
                 steep resource requirements of the first architecture.
                 It utilizes a uniform array of processing elements and
                 schedules all of the computations required to scan for
                 an RNA homolog onto those processing elements. The
                 estimated speedup for this architecture over the
                 Infernal software package ranges from just under 20$
                 \times $ to over 2,350$ \times $.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "Bioinformatics; RNA; secondary-structure alignment",
}

%%% TRETS 3(3), Sep 2010, article 15.  Topic (per abstract): reducing the
%%% number of memory ports needed by modulo scheduling (MS) in
%%% high-level synthesis, by inserting dummy operations in the kernel.
@Article{Ben-Asher:2010:RMC,
  author =       "Yosi Ben-Asher and Danny Meisler and Nadav Rotem",
  title =        "Reducing Memory Constraints in Modulo Scheduling
                 Synthesis for {FPGAs}",
  journal =      j-TRETS,
  volume =       "3",
  number =       "3",
  pages =        "15:1--15:??",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1839480.1839485",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Oct 8 18:26:34 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "In High-Level Synthesis (HLS), extracting parallelism
                 in order to create small and fast circuits is the main
                 advantage of HLS over software execution. Modulo
                 Scheduling (MS) is a technique in which a loop is
                 parallelized by overlapping different parts of
                 successive iterations. This ability to extract
                 parallelism makes MS an attractive synthesis technique
                 for loop acceleration. In this work we consider two
                 problems involved in the use of MS which are central
                 when targeting FPGAs. Current MS scheduling techniques
                 sacrifice execution times in order to meet resource and
                 delay constraints. Let ``ideal'' execution times be the
                 ones that could have been obtained by MS had we ignored
                 resource and delay constraints. Here we pose the
                 opposite problem, which is more suitable for HLS,
                 namely, how to reduce resource constraints without
                 sacrificing the ideal execution time. We focus on
                 reducing the number of memory ports used by the MS
                 synthesis, which we believe is a crucial resource for
                 HLS. In addition to reducing the number of memory ports
                 we consider the need to develop MS techniques that are
                 fast enough to allow interactive synthesis times and
                 repeated applications of the MS to explore different
                 possibilities of synthesizing the circuits. Current
                 solutions for MS synthesis that can handle memory
                 constraints are too slow to support interactive
                 synthesis. We formalize the problem of reducing the
                 number of parallel memory references in every row of
                 the kernel by a novel combinatorial setting. The
                 proposed technique is based on inserting dummy
                 operations in the kernel and by doing so, performing
                 modulo-shift operations such that the maximal number of
                 parallel memory references in a row is reduced.
                 Experimental results suggest improved execution times
                 for the synthesized circuit. The synthesis takes only a
                 few seconds even for large-size loops.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "FPGA; high-level synthesis; memory optimizations;
                 modulo-scheduling",
}

%%% TRETS 3(3), Sep 2010, article 16.  Topic (per abstract): VFloat, a
%%% variable-precision fixed- and floating-point hardware library,
%%% demonstrated on K-means clustering of multispectral images.
%%% The bibsource value also names fparith.bib in addition to trets.bib.
@Article{Wang:2010:VVP,
  author =       "Xiaojun Wang and Miriam Leeser",
  title =        "{VFloat}: a Variable Precision Fixed- and
                 Floating-Point Library for Reconfigurable Hardware",
  journal =      j-TRETS,
  volume =       "3",
  number =       "3",
  pages =        "16:1--16:??",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1839480.1839486",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Oct 8 18:26:34 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Optimal reconfigurable hardware implementations may
                 require the use of arbitrary floating-point formats
                 that do not necessarily conform to IEEE specified
                 sizes. We present a variable precision floating-point
                 library (VFloat) that supports general floating-point
                 formats including IEEE standard formats. Most
                 previously published floating-point formats for use
                 with reconfigurable hardware are subsets of our format.
                 Custom datapaths with optimal bitwidths for each
                 operation can be built using the variable precision
                 hardware modules in the VFloat library, enabling a
                 higher level of parallelism. The VFloat library
                 includes three types of hardware modules for format
                 control, arithmetic operations, and conversions between
                 fixed-point and floating-point formats. The format
                 conversions allow for hybrid fixed- and floating-point
                 operations in a single design. This gives the designer
                 control over a large number of design possibilities
                 including format as well as number range within the
                 same application. In this article, we give an overview
                 of the components in the VFloat library and demonstrate
                 their use in an implementation of the K-means
                 clustering algorithm applied to multispectral satellite
                 images.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "clustering; floating-point; Reconfigurable hardware",
}

%%% TRETS 3(3), Sep 2010, article 17.  Topic (per abstract): QuadroCore,
%%% a reconfigurable multiprocessor whose reconfiguration is driven by
%%% the compiler.  The abstract has two paragraphs split by \par.
%%% NOTE(review): the surname "Rueckert" looks like an ASCII
%%% transliteration of an umlauted name -- confirm against the
%%% publisher record before changing it.
@Article{Purnaprajna:2010:RRM,
  author =       "Madhura Purnaprajna and Mario Porrmann and Ulrich
                 Rueckert and Michael Hussmann and Michael Thies and Uwe
                 Kastens",
  title =        "Runtime Reconfiguration of Multiprocessors Based on
                 Compile-Time Analysis",
  journal =      j-TRETS,
  volume =       "3",
  number =       "3",
  pages =        "17:1--17:??",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1839480.1839487",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Oct 8 18:26:34 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "In multiprocessors, performance improvement is
                 typically achieved by exploring parallelism with fixed
                 granularities, such as instruction-level, task-level,
                 or data-level parallelism. We introduce a new
                 reconfiguration mechanism that facilitates variations
                 in these granularities in order to optimize resource
                 utilization in addition to performance improvements.
                 Our reconfigurable multiprocessor QuadroCore combines
                 the advantages of reconfigurability and parallel
                 processing. In this article, a unified
                 hardware-software approach for the design of our
                 QuadroCore is presented. This design flow is enabled
                 via compiler-driven reconfiguration which matches
                 application-specific characteristics to a fixed set of
                 architectural variations. A special reconfiguration
                 mechanism has been developed that alters the
                 architecture within a single clock cycle.\par

                 The QuadroCore has been implemented on Xilinx XC2V6000
                 for functional validation and on UMC's 90nm standard
                 cell technology for performance estimation. A diverse
                 set of applications have been mapped onto the
                 reconfigurable multiprocessor to meet orthogonal
                 performance characteristics in terms of time and power.
                 Speedup measurements show a 2--11 times performance
                 increase in comparison to a single processor.
                 Additionally, the reconfiguration scheme has been
                 applied to save power in data-parallel applications.
                 Gate-level simulations have been performed to measure
                 the power-performance trade-offs for two
                 computationally complex applications. The power reports
                 confirm that introducing this scheme of reconfiguration
                 results in power savings in the range of 15--24\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "17",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "compilation for multiprocessors; Reconfigurable
                 multiprocessors",
}

%%% TRETS 3(3), Sep 2010, article 18.  Topic (per abstract): minimizing
%%% application schedule length under bandwidth and logic-resource
%%% limits using partial dynamic reconfiguration (partial RTR).
@Article{Banerjee:2010:BMA,
  author =       "Sudarshan Banerjee and Elaheh Bozorgzadeh and Juanjo
                 Noguera and Nikil Dutt",
  title =        "Bandwidth Management in Application Mapping for
                 Dynamically Reconfigurable Architectures",
  journal =      j-TRETS,
  volume =       "3",
  number =       "3",
  pages =        "18:1--18:??",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1839480.1839488",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Oct 8 18:26:34 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Partial dynamic reconfiguration (often referred to as
                 partial RTR) enables true on-demand computing. In an
                 on-demand computing environment, a dynamically invoked
                 application is assigned resources such as data
                 bandwidth, configurable logic. The limited logic
                 resources are customized during application execution
                 by exploiting partial RTR. In this article, we propose
                 an approach that maximizes application performance when
                 available bandwidth and logic resources are limited.
                 Our proposed approach is based on theoretical
                 principles of minimizing application schedule length
                 under bandwidth and logic resource constraints. It
                 includes detailed microarchitectural considerations on
                 a commercially popular reconfigurable device, and it
                 exploits partial RTR very effectively by utilizing
                 data-parallelism property of common image-processing
                 applications. We present extensive application case
                 studies on a cycle-accurate simulation platform that
                 includes detailed resource considerations of the Xilinx
                 Virtex XC2V3000. Our experimental results demonstrate
                 that applying our proposed approach to common
                 image-filtering applications leads to 15--20\%
                 performance gain in scenarios with limited bandwidth,
                 when compared to prior work that also exploits
                 data-parallelism with RTR but includes simpler
                 bandwidth considerations. Last but not the least, we
                 also demonstrate how our proposed theoretical
                 principles can be directly applied to solve related
                 problems such as minimizing schedule length under logic
                 resource and power constraints.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "18",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
  keywords =     "bandwidth; Partial RTR; scheduling",
}

%%% TRETS 3(4), Nov 2010, article 19.
%%% NOTE(review): this entry carries no abstract or keywords field;
%%% add them if the publisher record supplies them.
@Article{Williams:2010:CFR,
  author =       "Jason Williams and Chris Massie and Alan D. George and
                 Justin Richardson and Kunal Gosrani and Herman Lam",
  title =        "Characterization of Fixed and Reconfigurable
                 Multi-Core Devices for Application Acceleration",
  journal =      j-TRETS,
  volume =       "3",
  number =       "4",
  pages =        "19:1--19:??",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1862648.1862649",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 23 11:26:33 MST 2010",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

%%% TRETS 3(4), Nov 2010, article 20.
%%% NOTE(review): this entry carries no abstract or keywords field;
%%% add them if the publisher record supplies them.
@Article{Huang:2010:RCA,
  author =       "Miaoqing Huang and Vikram K. Narayana and Harald
                 Simmler and Olivier Serres and Tarek El-Ghazawi",
  title =        "Reconfiguration and Communication-Aware Task
                 Scheduling for High-Performance Reconfigurable
                 Computing",
  journal =      j-TRETS,
  volume =       "3",
  number =       "4",
  pages =        "20:1--20:??",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1862648.1862650",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 23 11:26:33 MST 2010",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "20",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

%%% TRETS 3(4), Nov 2010, article 21.  No abstract or keywords field.
%%% NOTE(review): the second author is written "Wang Luzhou".  In the
%%% First-Last name form BibTeX takes the final token as the surname,
%%% so this parses with surname "Luzhou".  If the name is actually in
%%% family-name-first order, it should be entered as "Wang, Luzhou" --
%%% confirm against the publisher record before changing.
@Article{Sano:2010:FAB,
  author =       "Kentaro Sano and Wang Luzhou and Yoshiaki Hatsuda and
                 Takanori Iizuka and Satoru Yamamoto",
  title =        "{FPGA}-Array with Bandwidth-Reduction Mechanism for
                 Scalable and Power-Efficient Numerical Simulations
                 Based on Finite Difference Methods",
  journal =      j-TRETS,
  volume =       "3",
  number =       "4",
  pages =        "21:1--21:??",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1862648.1862651",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 23 11:26:33 MST 2010",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "21",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

%%% TRETS 3(4), Nov 2010, article 22.  No abstract or keywords field.
%%% The citation key uses plain ASCII "Saldana", while the author field
%%% keeps the accented surname as a braced TeX accent group.
@Article{Saldana:2010:MPM,
  author =       "Manuel Salda{\~n}a and Arun Patel and Christopher
                 Madill and Daniel Nunes and Danyao Wang and Paul Chow
                 and Ralph Wittig and Henry Styles and Andrew Putnam",
  title =        "{MPI} as a Programming Model for High-Performance
                 Reconfigurable Computers",
  journal =      j-TRETS,
  volume =       "3",
  number =       "4",
  pages =        "22:1--22:??",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1862648.1862652",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 23 11:26:33 MST 2010",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "22",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

%%% TRETS 3(4), Nov 2010, article 23.
%%% NOTE(review): this entry carries no abstract or keywords field;
%%% add them if the publisher record supplies them.
@Article{Chiu:2010:MDS,
  author =       "Matt Chiu and Martin C. Herbordt",
  title =        "Molecular Dynamics Simulations on High-Performance
                 Reconfigurable Computing Systems",
  journal =      j-TRETS,
  volume =       "3",
  number =       "4",
  pages =        "23:1--23:??",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1862648.1862653",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 23 11:26:33 MST 2010",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "23",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

%%% TRETS 3(4), Nov 2010, article 24.
%%% NOTE(review): this entry carries no abstract or keywords field;
%%% add them if the publisher record supplies them.
@Article{Montone:2010:PFD,
  author =       "Alessio Montone and Marco D. Santambrogio and
                 Donatella Sciuto and Seda Ogrenci Memik",
  title =        "Placement and Floorplanning in Dynamically
                 Reconfigurable {FPGAs}",
  journal =      j-TRETS,
  volume =       "3",
  number =       "4",
  pages =        "24:1--24:??",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1862648.1862654",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 23 11:26:33 MST 2010",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "24",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

%%% TRETS 3(4), Nov 2010, article 25.
%%% NOTE(review): this entry carries no abstract or keywords field;
%%% add them if the publisher record supplies them.
@Article{Reardon:2010:SFR,
  author =       "Casey Reardon and Eric Grobelny and Alan D. George and
                 Gongyu Wang",
  title =        "A Simulation Framework for Rapid Analysis of
                 Reconfigurable Computing Systems",
  journal =      j-TRETS,
  volume =       "3",
  number =       "4",
  pages =        "25:1--25:??",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1862648.1862655",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 23 11:26:33 MST 2010",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "25",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

%%% TRETS 3(4), Nov 2010, article 26.  No abstract or keywords field.
%%% The braces in the title protect "Monte Carlo", "FPGA", "GPP" and
%%% "GPU" from being lowercased by styles that sentence-case titles.
@Article{Tian:2010:HPQ,
  author =       "Xiang Tian and Khaled Benkrid",
  title =        "High-Performance Quasi-{Monte Carlo} Financial
                 Simulation: {FPGA} vs. {GPP} vs. {GPU}",
  journal =      j-TRETS,
  volume =       "3",
  number =       "4",
  pages =        "26:1--26:??",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1862648.1862656",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 23 11:26:33 MST 2010",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "26",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

%%% TRETS 4(1), Dec 2010, article 1: guest editorial for ARC 2009.
%%% This entry has no abstract or keywords field.
@Article{Woods:2010:GEA,
  author =       "Roger Woods and J{\"u}rgen Becker and Peter Athanas
                 and Fearghal Morgan",
  title =        "Guest Editorial {ARC 2009}",
  journal =      j-TRETS,
  volume =       "4",
  number =       "1",
  pages =        "1:1--1:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1857927.1857928",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Wed Jan 26 14:58:50 MST 2011",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

%%% TRETS 4(1), Dec 2010, article 2.  Topic (per abstract): mapping a
%%% multivariate Gaussian random number generator onto an FPGA using
%%% eigenvalue decomposition, with an analysis of truncation/rounding
%%% error.  This entry has an abstract but no keywords field.
@Article{Saiprasert:2010:OHA,
  author =       "Chalermpol Saiprasert and Christos-S. Bouganis and
                 George A. Constantinides",
  title =        "An Optimized Hardware Architecture of a Multivariate
                 {Gaussian} Random Number Generator",
  journal =      j-TRETS,
  volume =       "4",
  number =       "1",
  pages =        "2:1--2:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1857927.1857929",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Wed Jan 26 14:58:50 MST 2011",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Monte Carlo simulation is one of the most widely used
                 techniques for computationally intensive simulations in
                 mathematical analysis and modeling. A multivariate
                 Gaussian random number generator is one of the main
                 building blocks of such a system. Field Programmable
                 Gate Arrays (FPGAs) are gaining increased popularity as
                 an alternative means to the traditional general purpose
                 processors targeting the acceleration of the
                 computationally expensive random number generator
                 block. This article presents a novel approach for
                 mapping a multivariate Gaussian random number generator
                 onto an FPGA by optimizing the computational path in
                 terms of hardware resource usage subject to an
                 acceptable error in the approximation of the
                 distribution of interest. The proposed approach is
                 based on the eigenvalue decomposition algorithm which
                 leads to a design with different precision requirements
                 in the computational paths. An analysis on the impact
                 of the error due to truncation/rounding operation along
                 the computational path is performed and an analytical
                 expression of the error inserted into the system is
                 presented. Based on the error analysis, three
                 algorithms that optimize the resource utilization and
                 at the same time minimize the error in the output of
                 the system are presented and compared. Experimental
                 results reveal that the hardware resource usage on an
                 FPGA as well as the error in the approximation of the
                 distribution of interest are significantly reduced by
                 the use of the optimization techniques introduced in
                 the proposed approach.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

%%% TRETS 4(1), Dec 2010, article 3.
%%% NOTE(review): this entry carries no abstract or keywords field;
%%% add them if the publisher record supplies them.
@Article{Kahoul:2010:EHA,
  author =       "Asma Kahoul and Alastair M. Smith and George A.
                 Constantinides and Peter Y. K. Cheung",
  title =        "Efficient Heterogeneous Architecture Floorplan
                 Optimization using Analytical Methods",
  journal =      j-TRETS,
  volume =       "4",
  number =       "1",
  pages =        "3:1--3:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1857927.1857930",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Wed Jan 26 14:58:50 MST 2011",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@article{Kepa:2010:DAS,
  author          = {K. Kepa and F. Morgan and
                     K. Ko{\'s}ciuszkiewicz and L. Braun and
                     M. H{\"u}bner and J. Becker},
  title           = {Design Assurance Strategy and Toolset for
                     Partially Reconfigurable {FPGA} Systems},
  journal         = j-TRETS,
  volume          = {4},
  number          = {1},
  pages           = {4:1--4:??},
  month           = dec,
  year            = {2010},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1857927.1857931},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Wed Jan 26 14:58:50 MST 2011},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {4},
  fjournal        = {ACM Transactions on Reconfigurable Technology
                     and Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Inoue:2010:VGL,
  author          = {Kazuki Inoue and Qian Zhao and
                     Yasuhiro Okamoto and Hiroki Yosho and
                     Motoki Amagasaki and Masahiro Iida and
                     Toshinori Sueyoshi},
  title           = {A Variable-Grain Logic Cell and Routing
                     Architecture for a Reconfigurable {IP} Core},
  journal         = j-TRETS,
  volume          = {4},
  number          = {1},
  pages           = {5:1--5:??},
  month           = dec,
  year            = {2010},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1857927.1857932},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Wed Jan 26 14:58:50 MST 2011},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {5},
  fjournal        = {ACM Transactions on Reconfigurable Technology
                     and Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Guo:2010:OSC,
  author          = {Xu Guo and Patrick Schaumont},
  title           = {Optimized System-on-Chip Integration of a
                     Programmable {ECC} Coprocessor},
  journal         = j-TRETS,
  volume          = {4},
  number          = {1},
  pages           = {6:1--6:??},
  month           = dec,
  year            = {2010},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1857927.1857933},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Wed Jan 26 14:58:50 MST 2011},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {6},
  fjournal        = {ACM Transactions on Reconfigurable Technology
                     and Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Sterpone:2010:NTD,
  author          = {Luca Sterpone},
  title           = {A New Timing Driven Placement Algorithm for
                     Dependable Circuits on {SRAM}-based {FPGAs}},
  journal         = j-TRETS,
  volume          = {4},
  number          = {1},
  pages           = {7:1--7:??},
  month           = dec,
  year            = {2010},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1857927.1857934},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Wed Jan 26 14:58:50 MST 2011},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {7},
  fjournal        = {ACM Transactions on Reconfigurable Technology
                     and Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Lanuzza:2010:ESR,
  author          = {M. Lanuzza and P. Zicari and F. Frustaci and
                     S. Perri and P. Corsonello},
  title           = {Exploiting Self-Reconfiguration Capability to
                     Improve {SRAM}-based {FPGA} Robustness in Space
                     and Avionics Applications},
  journal         = j-TRETS,
  volume          = {4},
  number          = {1},
  pages           = {8:1--8:??},
  month           = dec,
  year            = {2010},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1857927.1857935},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Wed Jan 26 14:58:50 MST 2011},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {8},
  fjournal        = {ACM Transactions on Reconfigurable Technology
                     and Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Hsiung:2010:SPH,
  author          = {Pao-Ann Hsiung and Chun-Hsian Huang and
                     Jih-Sheng Shen and Chen-Chi Chiang},
  title           = {Scheduling and Placement of Hardware\slash
                     Software Real-Time Relocatable Tasks in
                     Dynamically Partially Reconfigurable Systems},
  journal         = j-TRETS,
  volume          = {4},
  number          = {1},
  pages           = {9:1--9:??},
  month           = dec,
  year            = {2010},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1857927.1857936},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Wed Jan 26 14:58:50 MST 2011},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {9},
  fjournal        = {ACM Transactions on Reconfigurable Technology
                     and Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Kanazawa:2010:ASL,
  author          = {Kenji Kanazawa and Tsutomu Maruyama},
  title           = {An Approach for Solving Large {SAT} Problems
                     on {FPGA}},
  journal         = j-TRETS,
  volume          = {4},
  number          = {1},
  pages           = {10:1--10:??},
  month           = dec,
  year            = {2010},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1857927.1857937},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Wed Jan 26 14:58:50 MST 2011},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {10},
  fjournal        = {ACM Transactions on Reconfigurable Technology
                     and Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Lu:2010:ERD,
  author          = {Yingxi Lu and Maire O'Neill and John McCanny},
  title           = {Evaluation of Random Delay Insertion against
                     {DPA} on {FPGAs}},
  journal         = j-TRETS,
  volume          = {4},
  number          = {1},
  pages           = {11:1--11:??},
  month           = dec,
  year            = {2010},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1857927.1857938},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Wed Jan 26 14:58:50 MST 2011},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {11},
  fjournal        = {ACM Transactions on Reconfigurable Technology
                     and Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Bergeron:2011:LTF,
  author          = {Etienne Bergeron and Louis-David Perron and
                     Marc Feeley and Jean Pierre David},
  title           = {Logarithmic-Time {FPGA} Bitstream Analysis:
                     a Step Towards {JIT} Hardware Compilation},
  journal         = j-TRETS,
  volume          = {4},
  number          = {2},
  pages           = {12:1--12:??},
  month           = may,
  year            = {2011},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1968502.1968503},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Jun 7 18:34:54 MDT 2011},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {12},
  fjournal        = {ACM Transactions on Reconfigurable Technology
                     and Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@Article{Vaidya:2011:NMC,
  author =       "Pranav Vaidya and Jaehwan John Lee",
  title =        "A Novel Multicontext Coarse-Grained Reconfigurable
                 Architecture {(CGRA)} for Accelerating Column-Oriented
                 Databases",
  journal =      j-TRETS,
  volume =       "4",
  number =       "2",
  pages =        "13:1--13:??",
  month =        may,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1968502.1968504",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Jun 7 18:34:54 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@article{ONeill:2011:SPM,
  author          = {Shane O'Neill and Roger Francis Woods and
                     Alan James Marshall and Qi Zhang},
  title           = {A Scalable and Programmable Modular Traffic
                     Manager Architecture},
  journal         = j-TRETS,
  volume          = {4},
  number          = {2},
  pages           = {14:1--14:??},
  month           = may,
  year            = {2011},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1968502.1968505},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Jun 7 18:34:54 MDT 2011},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {14},
  fjournal        = {ACM Transactions on Reconfigurable Technology
                     and Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Nakajima:2011:FOR,
  author          = {Mao Nakajima and Minoru Watanabe},
  title           = {Fast Optical Reconfiguration of a Nine-Context
                     {DORGA} Using a Speed Adjustment Control},
  journal         = j-TRETS,
  volume          = {4},
  number          = {2},
  pages           = {15:1--15:??},
  month           = may,
  year            = {2011},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1968502.1968506},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Jun 7 18:34:54 MDT 2011},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {15},
  fjournal        = {ACM Transactions on Reconfigurable Technology
                     and Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Tai:2011:POA,
  author          = {Tzu-Chiang Tai and Yen-Tai Lai},
  title           = {A Performance-Oriented Algorithm with
                     Consideration on Communication Cost for
                     Dynamically Reconfigurable {FPGA} Partitioning},
  journal         = j-TRETS,
  volume          = {4},
  number          = {2},
  pages           = {16:1--16:??},
  month           = may,
  year            = {2011},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1968502.1968507},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Jun 7 18:34:54 MDT 2011},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {16},
  fjournal        = {ACM Transactions on Reconfigurable Technology
                     and Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Demertzi:2011:DSO,
  author          = {Melina Demertzi and Pedro C. Diniz and
                     Mary W. Hall and Anna C. Gilbert and Yi Wang},
  title           = {Domain-Specific Optimization of Signal
                     Recognition Targeting {FPGAs}},
  journal         = j-TRETS,
  volume          = {4},
  number          = {2},
  pages           = {17:1--17:??},
  month           = may,
  year            = {2011},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1968502.1968508},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Jun 7 18:34:54 MDT 2011},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {17},
  fjournal        = {ACM Transactions on Reconfigurable Technology
                     and Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Galuzzi:2011:ISE,
  author          = {Carlo Galuzzi and Koen Bertels},
  title           = {The Instruction-Set Extension Problem: a Survey},
  journal         = j-TRETS,
  volume          = {4},
  number          = {2},
  pages           = {18:1--18:28},
  month           = may,
  year            = {2011},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1968502.1968509},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Jun 7 18:34:54 MDT 2011},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {18},
  fjournal        = {ACM Transactions on Reconfigurable Technology
                     and Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Rupnow:2011:SAD,
  author          = {Kyle Rupnow and Keith D. Underwood and
                     Katherine Compton},
  title           = {Scientific Application Demands on a
                     Reconfigurable Functional Unit Interface},
  journal         = j-TRETS,
  volume          = {4},
  number          = {2},
  pages           = {19:1--19:??},
  month           = may,
  year            = {2011},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1968502.1968510},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Jun 7 18:34:54 MDT 2011},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {19},
  fjournal        = {ACM Transactions on Reconfigurable Technology
                     and Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@article{Kaganov:2011:FAM,
  author          = {Alexander Kaganov and Asif Lakhany and
                     Paul Chow},
  title           = {{FPGA} Acceleration of {MultiFactor CDO} Pricing},
  journal         = j-TRETS,
  volume          = {4},
  number          = {2},
  pages           = {20:1--20:??},
  month           = may,
  year            = {2011},
  coden           = {????},
  doi             = {https://doi.org/10.1145/1968502.1968511},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Jun 7 18:34:54 MDT 2011},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {20},
  fjournal        = {ACM Transactions on Reconfigurable Technology
                     and Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

@Article{Labrecque:2011:ASS,
  author =       "Martin Labrecque and Mark C. Jeffrey and J. Gregory
                 Steffan",
  title =        "Application-Specific Signatures for Transactional
                 Memory in Soft Processors",
  journal =      j-TRETS,
  volume =       "4",
  number =       "3",
  pages =        "21:1--21:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2000832.2000833",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Aug 30 08:13:57 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "21",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Boland:2011:OMB,
  author =       "David Boland and George A. Constantinides",
  title =        "Optimizing Memory Bandwidth Use and Performance for
                 Matrix-Vector Multiplication in Iterative Methods",
  journal =      j-TRETS,
  volume =       "4",
  number =       "3",
  pages =        "22:1--22:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2000832.2000834",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Aug 30 08:13:57 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "22",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Glaser:2011:TFT,
  author =       "Johann Glaser and Markus Damm and Jan Haase and
                 Christoph Grimm",
  title =        "{TR-FSM}: Transition-Based Reconfigurable Finite State
                 Machine",
  journal =      j-TRETS,
  volume =       "4",
  number =       "3",
  pages =        "23:1--23:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2000832.2000835",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Aug 30 08:13:57 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "23",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Parvez:2011:ASF,
  author =       "Husain Parvez and Zied Marrakchi and Alp Kilic and
                 Habib Mehrez",
  title =        "Application-Specific {FPGA} Using Heterogeneous Logic
                 Blocks",
  journal =      j-TRETS,
  volume =       "4",
  number =       "3",
  pages =        "24:1--24:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2000832.2000836",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Aug 30 08:13:57 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "24",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Yan:2011:FBA,
  author =       "Jing Yan and Ning-Yi Xu and Xiong-Fei Cai and Rui Gao
                 and Yu Wang and Rong Luo and Feng-Hsiung Hsu",
  title =        "An {FPGA}-Based Accelerator for {LambdaRank} in {Web}
                 Search Engines",
  journal =      j-TRETS,
  volume =       "4",
  number =       "3",
  pages =        "25:1--25:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2000832.2000837",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Aug 30 08:13:57 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "In modern Web search engines, Neural Network
                 (NN)-based learning to rank algorithms is intensively
                 used to increase the quality of search results.
                 LambdaRank is one such algorithm. However, it is hard
                 to be efficiently accelerated by computer clusters or
                 GPUs, because: (i) the cost function for the ranking
                 problem is much more complex than that of traditional
                 Back-Propagation (BP) NNs, and (ii) no coarse-grained
                 parallelism exists in the algorithm. This article
                 presents an FPGA-based accelerator solution to provide
                 high computing performance with low power consumption.
                 A compact deep pipeline is proposed to handle the
                 complex computing in the batch updating. The area
                 scales linearly with the number of hidden nodes in the
                 algorithm. We also carefully design a data format to
                 enable streaming consumption of the training data from
                 the host computer. The accelerator shows up to 15.3X
                 (with PCIe x4) and 23.9X (with PCIe x8) speedup
                 compared with the pure software implementation on
                 datasets from a commercial search engine.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "25",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Aggarwal:2011:SMP,
  author =       "Vikas Aggarwal and Alan D. George and Changil Yoon and
                 Kishore Yalamanchili and Herman Lam",
  title =        "{SHMEM+}: a Multilevel-{PGAS} Programming Model for
                 Reconfigurable Supercomputing",
  journal =      j-TRETS,
  volume =       "4",
  number =       "3",
  pages =        "26:1--26:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2000832.2000838",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Aug 30 08:13:57 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "26",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Holland:2011:AMM,
  author =       "Brian Holland and Alan D. George and Herman Lam and
                 Melissa C. Smith",
  title =        "An Analytical Model for Multilevel Performance
                 Prediction of Multi-{FPGA} Systems",
  journal =      j-TRETS,
  volume =       "4",
  number =       "3",
  pages =        "27:1--27:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2000832.2000839",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Aug 30 08:13:57 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "27",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Shannon:2011:LRH,
  author =       "Lesley Shannon and Paul Chow",
  title =        "Leveraging Reconfigurability in the Hardware\slash
                 Software Codesign Process",
  journal =      j-TRETS,
  volume =       "4",
  number =       "3",
  pages =        "28:1--28:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2000832.2000840",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Aug 30 08:13:57 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "28",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Nava:2011:ADR,
  author =       "Federico Nava and Donatella Sciuto and Marco Domenico
                 Santambrogio and Stefan Herbrechtsmeier and Mario
                 Porrmann and Ulf Witkowski and Ulrich Rueckert",
  title =        "Applying Dynamic Reconfiguration in the Mobile
                 Robotics Domain: a Case Study on Computer Vision
                 Algorithms",
  journal =      j-TRETS,
  volume =       "4",
  number =       "3",
  pages =        "29:1--29:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2000832.2000841",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Aug 30 08:13:57 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "29",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Koehler:2011:PAB,
  author =       "Seth Koehler and Greg Stitt and Alan D. George",
  title =        "Platform-Aware Bottleneck Detection for Reconfigurable
                 Computing Applications",
  journal =      j-TRETS,
  volume =       "4",
  number =       "3",
  pages =        "30:1--30:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2000832.2000842",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Aug 30 08:13:57 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "30",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Cheung:2011:ISS,
  author =       "Peter Y. K. Cheung",
  title =        "Introduction to Special Section {FPGA 2009}",
  journal =      j-TRETS,
  volume =       "4",
  number =       "4",
  pages =        "31:1--31:??",
  month =        dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2068716.2068717",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Mar 16 16:20:35 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "31",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Luu:2011:VFC,
  author =       "Jason Luu and Ian Kuon and Peter Jamieson and Ted
                 Campbell and Andy Ye and Wei Mark Fang and Kenneth Kent
                 and Jonathan Rose",
  title =        "{VPR 5.0}: {FPGA CAD} and Architecture Exploration
                 Tools with Single-Driver Routing, Heterogeneity and
                 Process Scaling",
  journal =      j-TRETS,
  volume =       "4",
  number =       "4",
  pages =        "32:1--32:??",
  month =        dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2068716.2068718",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Mar 16 16:20:35 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "The VPR toolset has been widely used in FPGA
                 architecture and CAD research, but has not evolved over
                 the past decade. This article describes and illustrates
                 the use of a new version of the toolset that includes
                 four new features: first, it supports a broad range of
                 single-driver routing architectures, which have
                 superior architectural and electrical properties over
                 the prior multidriver approach (and which is now
                 employed in the majority of FPGAs sold). Second, it can
                 now model, for placement and routing a heterogeneous
                 selection of hard logic blocks. This is a key (but not
                 final) step toward the inclusion of blocks such as
                 memory and multipliers. Third, we provide optimized
                 electrical models for a wide range of architectures in
                 different process technologies, including a range of
                 area-delay trade-offs for each single architecture.
                 Finally, to maintain robustness and support future
                 development the release includes a set of regression
                 tests for the software. To illustrate the use of the
                 new features, we explore several architectural issues:
                 the FPGA area efficiency versus logic block
                 granularity, the effect of single-driver routing, and a
                 simple use of the heterogeneity to explore the impact
                 of hard multipliers on wiring track count.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "32",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Rubin:2011:CYO,
  author =       "Raphael Rubin and Andr{\'e} DeHon",
  title =        "Choose-your-own-adventure routing: Lightweight
                 load-time defect avoidance",
  journal =      j-TRETS,
  volume =       "4",
  number =       "4",
  pages =        "33:1--33:??",
  month =        dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2068716.2068719",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Mar 16 16:20:35 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Aggressive scaling increases the number of devices we
                 can integrate per square millimeter but makes it
                 increasingly difficult to guarantee that each device
                 fabricated has the intended operational
                 characteristics. Without careful mitigation, component
                 yield rates will fall, potentially negating the
                 economic benefits of scaling. The fine-grained
                 reconfigurability inherent in FPGAs is a powerful tool
                 that can allow us to drop the stringent requirement
                 that every device be fabricated perfectly in order for
                 a component to be useful. To exploit inherent FPGA
                 reconfigurability while avoiding full CAD mapping, we
                 propose lightweight techniques compatible with the
                 current single bitstream model that can avoid defective
                 devices, reducing yield loss at high defect rates. In
                 particular, by embedding testing operations and
                 alternative path configurations into the bitstream,
                 each FPGA can avoid defects by making only simple,
                 greedy decisions at bitstream load time. With 20\%
                 additional tracks above the minimum routable channel
                 width, routes can tolerate 0.01\% switch and wire
                 defect rates, raising yield from essentially 0\% to
                 near 100\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "33",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Mishchenko:2011:SDC,
  author =       "Alan Mishchenko and Robert Brayton and Jie-Hong R.
                 Jiang and Stephen Jang",
  title =        "Scalable don't-care-based logic optimization and
                 resynthesis",
  journal =      j-TRETS,
  volume =       "4",
  number =       "4",
  pages =        "34:1--34:??",
  month =        dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2068716.2068720",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Mar 16 16:20:35 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "We describe an optimization method for combinational
                 and sequential logic networks, with emphasis on
                 scalability. The proposed resynthesis (a) is capable of
                 substantial logic restructuring, (b) is customizable to
                 solve a variety of optimization tasks, and (c) has
                 reasonable runtime on industrial designs. The approach
                 uses don't-cares computed for a window surrounding a
                 node and can take into account external don't-cares
                 (e.g., unreachable states). It uses a SAT solver for
                 all aspects of Boolean manipulation: computing
                 don't-cares for a node in the window, and deriving a
                 new Boolean function of the node after resubstitution.
                 Experimental results on 6-input LUT networks after a
                 high effort synthesis show substantial reductions in
                 area and delay. When applied to 20 large academic
                 benchmarks, the LUT counts and logic levels are reduced
                 by 45.0\% and 12.2\%, respectively. The longest runtime
                 for synthesis and mapping is about two minutes. When
                 applied to a set of 14 industrial benchmarks ranging up
                 to 83K 6-LUTs, the LUT counts and logic levels are
                 reduced by 11.8\% and 16.5\%, respectively. The longest
                 runtime is about 30 minutes.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "34",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Kennings:2011:FTM,
  author =       "Andrew Kennings and Kristofer Vorwerk and Arun Kundu
                 and Val Pevzner and Andy Fox",
  title =        "{FPGA} technology mapping with encoded libraries and
                 staged priority cuts",
  journal =      j-TRETS,
  volume =       "4",
  number =       "4",
  pages =        "35:1--35:??",
  month =        dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2068716.2068721",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Mar 16 16:20:35 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Technology mapping is an important step in the FPGA
                 CAD flow in which a network of simple gates is
                 converted into a network of logic blocks. This article
                 considers enhancements to a traditional LUT-based
                 mapping algorithm for an FPGA comprised of logic blocks
                 which implement only a subset of functions of up to k
                 variables; specifically, the logic block is a partial
                 LUT, but it possesses more inputs than a typical LUT.
                 An analysis of the logic block is presented, and
                 techniques for postmapping area recovery and
                 timing-driven buffer insertion are also described.
                 Numerical results are put forth which substantiate the
                 efficacy of the proposed methods using real circuits
                 mapped to a commercial FPGA architecture.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "35",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Papadimitriou:2011:PPR,
  author =       "Kyprianos Papadimitriou and Apostolos Dollas and Scott
                 Hauck",
  title =        "Performance of partial reconfiguration in {FPGA}
                 systems: a survey and a cost model",
  journal =      j-TRETS,
  volume =       "4",
  number =       "4",
  pages =        "36:1--36:??",
  month =        dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2068716.2068722",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Mar 16 16:20:35 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Fine-grain reconfigurable devices suffer from the time
                 needed to load the configuration bitstream. Even for
                 small bitstreams in partially reconfigurable FPGAs this
                 time cannot be neglected. In this article we survey the
                 performance of the factors that contribute to the
                 reconfiguration speed. Then, we study an FPGA-based
                 system architecture and with real experiments we
                 produce a cost model of Partial Reconfiguration (PR).
                 This model is introduced to calculate the expected
                 reconfiguration time and throughput. In order to
                 develop a realistic model we take into account all the
                 physical components that participate in the
                 reconfiguration process. We analyze the parameters that
                 affect the generality of the model and the adjustments
                 needed per system for error-free evaluation. We verify
                 it with real measurements, and then we employ it to
                 evaluate existing systems presented in previous
                 publications. The percentage error of the cost model
                 when comparing its results with the actual values of
                 those publications varies from 36\% to 63\%, whereas
                 existing works report differences up to two orders of
                 magnitude. Present work enables a user to evaluate PR
                 and decide whether it is suitable for a certain
                 application prior to entering the complex PR design
                 flow.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "36",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Chen:2011:EDL,
  author =       "Xiaoheng Chen and Venkatesh Akella",
  title =        "Exploiting data-level parallelism for energy-efficient
                 implementation of {LDPC} decoders and {DCT} on an
                 {FPGA}",
  journal =      j-TRETS,
  volume =       "4",
  number =       "4",
  pages =        "37:1--37:??",
  month =        dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2068716.2068723",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Mar 16 16:20:35 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "We explore the use of Data-Level Parallelism (DLP) as
                 a way of improving the energy efficiency and power
                 consumption involved in running applications on an
                 FPGA. We show that static power consumption is a
                 significant fraction of the overall power consumption
                 in an FPGA and that it does not change significantly
                 even as the area required by an architecture increases,
                 because of the dominance of interconnect in an FPGA. We
                 show that the degree of DLP can be used in conjunction
                 with frequency scaling to reduce the overall power
                 consumption.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "37",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Easwaran:2011:NLB,
  author =       "Lakshmi Easwaran and Ali Akoglu",
  title =        "Net-length-based routability-driven power-aware
                 clustering",
  journal =      j-TRETS,
  volume =       "4",
  number =       "4",
  pages =        "38:1--38:??",
  month =        dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2068716.2068724",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Mar 16 16:20:35 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "The state-of-the-art power-aware clustering tool,
                 P-T-VPack, achieves energy reduction by localizing nets
                 with high switching activity at the expense of channel
                 width and area. In this study, we employ predicted
                 individual postplacement net length information during
                 clustering and prioritize longer nets. This approach
                 targets the capacitance factor for energy reduction,
                 and prioritizes longer nets for channel width and area
                 reduction. We first introduce a new clustering
                 strategy, W-T-VPack, which replaces the switching
                 activity in P-T-VPack with a net length factor. We
                 obtain a 9.87\% energy reduction over T-VPack (3.78\%
                 increase over P-T-VPack), while at the same time
                 completely eliminating P-T-VPack's channel width and
                 area overhead. We then introduce W-P-T-VPack, which
                 combines switching activity and net length factors.
                 W-P-T-VPack achieves 14.26\% energy reduction (0.31\%
                 increase over P-T-VPack), while further improving
                 channel width by up to 12.87\% for different cluster
                 sizes. We investigate the energy performance of
                 routability (channel width)-driven clustering
                 algorithms, and show that W-T-VPack consistently
                 outperforms T-RPack and iRAC by at least 11.23\% and
                 9.07\%, respectively. We conclude that net-length-based
                 clustering is an effective method to concurrently
                 target energy and channel width.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "38",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Parandeh-Afshar:2011:CTS,
  author =       "Hadi Parandeh-Afshar and Arkosnato Neogy and Philip
                 Brisk and Paolo Ienne",
  title =        "Compressor tree synthesis on commercial
                 high-performance {FPGAs}",
  journal =      j-TRETS,
  volume =       "4",
  number =       "4",
  pages =        "39:1--39:??",
  month =        dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2068716.2068725",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Mar 16 16:20:35 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Compressor trees are a class of circuits that
                 generalizes multioperand addition and the partial
                 product reduction trees of parallel multipliers using
                 carry-save arithmetic. Compressor trees naturally occur
                 in many DSP applications, such as FIR filters, and, in
                 the more general case, their use can be maximized
                 through the application of high-level transformations
                 to arithmetically intensive data flow graphs. Due to
                 the presence of carry-chains, it has long been thought
                 that trees of 2- or 3-input carry-propagate adders are
                 more efficient than compressor trees for FPGA
                 synthesis; however, this is not the case. This article
                 presents a heuristic for FPGA synthesis of compressor
                 trees that outperforms adder trees and exploits
                 carry-chains when possible. The experimental results
                 show that, on average, the use of compressor trees can
                 reduce critical path delay by 33\% and 45\%
                 respectively, compared to adder trees synthesized on
                 the Xilinx Virtex-5 and Altera Stratix III FPGAs.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "39",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Inoue:2011:TCD,
  author =       "Hiroaki Inoue and Junya Yamada and Hideyuki Yoneda and
                 Katsumi Togawa and Masato Motomura and Koichiro
                 Furuta",
  title =        "Test compression for dynamically reconfigurable
                 processors",
  journal =      j-TRETS,
  volume =       "4",
  number =       "4",
  pages =        "40:1--40:??",
  month =        dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2068716.2068726",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Mar 16 16:20:35 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "We present the world's first test compression
                 technique that features automation of compression rules
                 for test time reduction on dynamically reconfigurable
                 processors. Evaluations on an actual 40-nm product show
                 that our technique achieves a 2.7 times compression
                 ratio for original configuration information (better
                 than does GZIP), the peak decompression bandwidth of
                 1.6 GB/s, and 2.7 times shorter test times.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "40",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Zick:2012:LCS,
  author =       "Kenneth M. Zick and John P. Hayes",
  title =        "Low-cost sensing with ring oscillator arrays for
                 healthier reconfigurable systems",
  journal =      j-TRETS,
  volume =       "5",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2133352.2133353",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Mar 20 12:12:48 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Electronic systems on a chip increasingly suffer from
                 component variation, voltage noise, thermal hotspots,
                 and other subtle physical phenomena. Systems with
                 reconfigurability have unique opportunities for
                 adapting to such effects. Required, however, are
                 low-cost, fine-grained methods for sensing physical
                 parameters. This article presents powerful, novel
                 approaches to online sensing, including methods for
                 designing compact reconfigurable sensors, low-cost
                 threshold detection, and several enhanced measurement
                 procedures. Together, the approaches help enable
                 systems to autonomously uncover a wealth of physical
                 information. A highly efficient counter and improved
                 ring oscillator are introduced, enabling an entire
                 sensor node in just 8 Virtex-5 LUTs. We describe how
                 variations can be measured in delay, temperature,
                 switching-induced IR drop, and leakage-induced IR drop.
                 We demonstrate the proposed approach with an
                 experimental system based on a Virtex-5, instrumented
                 with over 100 sensors at an overhead of only 1.3\%.
                 Results from thermally controlled experiments provide
                 some surprising insights and illustrate the utility of
                 the approach. Online sensing can help open the door to
                 physically adaptive computing, including fine-grained
                 power, reliability, and health management schemes for
                 systems on a chip.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Michail:2012:EHT,
  author =       "Harris E. Michail and George S. Athanasiou and Vasilis
                 Kelefouras and George Theodoridis and Costas E.
                 Goutis",
  title =        "On the exploitation of a high-throughput {SHA-256
                 FPGA} design for {HMAC}",
  journal =      j-TRETS,
  volume =       "5",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2133352.2133354",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Mar 20 12:12:48 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "High-throughput and area-efficient designs of hash
                 functions and corresponding mechanisms for Message
                 Authentication Codes (MACs) are in high demand due to
                 new security protocols that have arisen and call for
                 security services in every transmitted data packet. For
                 instance, IPv6 incorporates the IPSec protocol for
                 secure data transmission. However, the IPSec's
                 performance bottleneck is the HMAC mechanism which is
                 responsible for authenticating the transmitted data.
                 HMAC's performance bottleneck in its turn is the
                 underlying hash function. In this article a
                 high-throughput and small-size SHA-256 hash function
                 FPGA design and the corresponding HMAC FPGA design is
                 presented. Advanced optimization techniques have been
                 deployed leading to a SHA-256 hashing core which
                 performs more than 30\% better, compared to the next
                 better design. This improvement is achieved both in
                 terms of throughput as well as in terms of
                 throughput/area cost factor. It is the first reported
                 SHA-256 hashing core that exceeds 11Gbps (after place
                 and route in Xilinx Virtex 6 board).",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Olivares:2012:RAV,
  author =       "Joaqu{\'\i}n Olivares",
  title =        "Reconfigurable architecture for {VBSME} with variable
                 pixel precision",
  journal =      j-TRETS,
  volume =       "5",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2133352.2133355",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Mar 20 12:12:48 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Current video coding standards, e.g. MPEG-4 H.264/AVC,
                 include Variable Block Size Motion Estimation, in this
                 paper, this process is implemented by a reconfigurable
                 architecture based on Signed Digit arithmetic. Bit
                 serial computation is applied to reconfigure pixel
                 precision. The reconfigurable architectural model is
                 extremely simple to reconfigure. Pixel truncation is
                 used to speed up computation saving up 23.5\% of clock
                 cycles for 4-bit precision. This design allows to
                 process all motion vectors of a block in just one
                 iteration. This system has been implemented in FPGA,
                 and HDTVp results are presented. Main characteristics,
                 of this architecture are: very reduced cost, high
                 performance, and reconfigurable pixel precision, these
                 features could be useful in mobile devices.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Siozios:2012:NFE,
  author =       "Kostas Siozios and Vasilis F. Pavlidis and Dimitrios
                 Soudris",
  title =        "A novel framework for exploring {$3$-D} {FPGAs} with
                 heterogeneous interconnect fabric",
  journal =      j-TRETS,
  volume =       "5",
  number =       "1",
  pages =        "4:1--4:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2133352.2133356",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Mar 20 12:12:48 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "A heterogeneous interconnect architecture can be a
                 useful approach for the design of 3-D FPGAs. A
                 methodology to investigate heterogeneous
                 interconnection schemes for 3-D FPGAs under different
                 3-D fabrication technologies is proposed. Application
                 of the proposed methodology on benchmark circuits
                 demonstrates an improvement in delay, power
                 consumption, and total wire-length of approximately
                 41\%, 32\%, and 36\%, respectively, as compared to 2-D
                 FPGAs. These improvements are additional to reducing
                 the number of interlayer connections. The fewer
                 interlayer connections are traded off for a higher
                 yield. An area model to evaluate this trade-off is
                 presented. Results indicate that a heterogeneous 3-D
                 FPGA requires 37\% less area as compared to a
                 homogeneous 3-D FPGA. Consequently, the heterogeneous
                 FPGAs can exhibit a higher manufacturing yield. A
                 design toolset is also developed to support the design
                 and exploration of various performance metrics for the
                 proposed 3-D FPGAs.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Takano:2012:DAA,
  author =       "Shigeyuki Takano",
  title =        "Design and analysis of adaptive processor",
  journal =      j-TRETS,
  volume =       "5",
  number =       "1",
  pages =        "5:1--5:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2133352.2133357",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Mar 20 12:12:48 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "A new computation model called CACHE (Cache
                 Architecture for Configurable Hardware Engine) is
                 proposed in this paper. This model does not require a
                 dedicated host processor and its software to harness
                 the reconfiguration. Autonomous reconfiguration is
                 performed within a working-set of application
                 datapaths. The CACHE model has lots of side effects;
                 caching, resource allocation and assignment, placement
                 and routing, and defragmentation, with a processing
                 array itself and a special register called a
                 working-set register file. The model aims to reduce
                 three major workloads: (1) the processor and
                 application design workload, (2) runtime resource
                 management and scheduling workload, and (3)
                 reconfiguration workload. In order to reduce these
                 workloads, processor architecture is definitely
                 different from traditional computing model and its
                 microprocessor architecture. There are three major
                 ideas to construct the computing system: (1) an on-chip
                 working-set model mainly in order to control load and
                 store of streams, namely to control traffics
                 introducing overheads, (2) an on-chip deadlock
                 properties model mainly in order to manage resources
                 and to continuously configure datapaths corresponding
                 to a working-set window, (3) a cache memory technique
                 to work for these models, the mechanism is equivalent
                 to the working-set window, and the cache memory's
                 procedure is equivalent to resource request,
                 acquirement, and release of deadlock properties. The
                 first model focuses onto streaming applications, for
                 example vector and matrix operations, filters, and so
                 on, which takes coarser grained operations such as
                 integer operations of C-language. Regarding performance
                 compared with DSPs, that comes from constant throughput
                 across different scale of the applications. In
                 addition, extended model, we call Instant model that
                 automatically generates instance of a datapath,
                 outperforms the DSPs. This paper shows its computation
                 model, architecture, low-level design, and analyses
                 about basic characteristics of the execution.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

%%% TRETS volume 5, number 1 (March 2012), article 6.
%%% In the abstract's math, the thousands separators are written {,}
%%% so that TeX does not insert punctuation spacing after the comma.
@Article{Zhang:2012:PSF,
  author =       "Wei Zhang and Vaughn Betz and Jonathan Rose",
  title =        "Portable and scalable {FPGA}-based acceleration of a
                 direct linear system solver",
  journal =      j-TRETS,
  volume =       "5",
  number =       "1",
  pages =        "6:1--6:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2133352.2133358",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Mar 20 12:12:48 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "FPGAs have the potential to serve as a platform for
                 accelerating many computations including scientific
                 applications. However, the large development cost and
                 short life span for FPGA designs have limited their
                 adoption by the scientific computing community.
                 FPGA-based scientific computing and many kinds of
                 embedded computing could become more practical if there
                 were hardware libraries that were portable to any
                 FPGA-based system with performance that scaled with the
                 size of the FPGA. To illustrate this idea we have
                 implemented one common super-computing library
                 function: the LU factorization method for solving
                 systems of linear equations. This paper describes a
                 method for making the design both portable and scalable
                 that should be illustrative if such libraries are to be
                 built in the future. The design is a software-based
                 generator that leverages both the flexibility of a
                 software programming language and the parameters
                 inherent in an hardware description language. The
                 generator accepts parameters that describe the FPGA
                 capacity and external memory capabilities. We compare
                 the performance of our engine executing on the largest
                 FPGA available at the time of this work (an Altera
                 Stratix III 3S340) to a single processor core
                 fabricated in the same 65nm IC process running a highly
                 optimized software implementation from the processor
                 vendor. For single precision matrices on the order of $
                 10{,}000 \times 10{,}000 $ elements, the FPGA
                 implementation is 2.2 times faster and the energy
                 dissipated per useful GFLOP operation is a factor of 5
                 times less. For double precision, the FPGA
                 implementation is 1.7 times faster and 3.5 times more
                 energy efficient.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

%%% TRETS volume 5, number 2 (June 2012), article 7.
@article{Aggarwal:2012:SFT,
  author = {Vikas Aggarwal and Greg Stitt and Alan George and
    Changil Yoon},
  title = {{SCF}: a Framework for Task-Level Coordination in
    Reconfigurable, Heterogeneous Systems},
  journal = j-TRETS,
  volume = {5},
  number = {2},
  pages = {7:1--7:??},
  month = jun,
  year = {2012},
  coden = {????},
  doi = {https://doi.org/10.1145/2209285.2209286},
  issn = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l = {1936-7406},
  bibdate = {Tue Nov 6 18:07:43 MST 2012},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {Heterogeneous computing systems comprised of
    accelerators such as FPGAs, GPUs, and manycore
    processors coupled with standard microprocessors are
    becoming an increasingly popular solution for future
    computing systems due to their higher performance and
    energy efficiency. Although programming languages and
    tools are evolving to simplify device-level design,
    programming such systems is still difficult and
    time-consuming largely due to system-wide challenges
    involving communication between heterogeneous devices,
    which currently require ad hoc solutions. Most
    communication frameworks and APIs which have dominated
    parallel application development for decades were
    developed for homogeneous systems, and hence cannot be
    directly employed for hybrid systems. To solve this
    problem, this article presents the System Coordination
    Framework (SCF), which employs message passing to
    transparently enable communication between tasks
    described using different programming tools (and
    languages), and running on heterogeneous processing
    devices of systems from domains ranging from embedded
    systems to High-Performance Computing (HPC) systems. By
    hiding low-level architectural details of the
    underlying communication from an application designer,
    SCF can improve application development productivity,
    provide higher levels of application portability, and
    offer rapid design-space exploration of different
    task/device mappings. In addition, SCF enables custom
    communication synthesis that exploits mechanisms
    specific to different devices and platforms, which can
    provide performance improvements over generic solutions
    employed previously. Our results indicate a performance
    improvement of 28$ \times $ and 682$ \times $ by
    employing FPGA devices for two applications presented
    in this article, while simultaneously improving the
    developer productivity by approximately 2.5 to 5 times
    by using SCF.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {7},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-url = {https://dl.acm.org/loi/trets},
}

%%% TRETS volume 5, number 2 (June 2012), article 8.
@article{Fekete:2012:DDR,
  author = {S{\'a}ndor P. Fekete and Tom Kamphans and Nils Schweer
    and Christopher Tessars and Jan C. van der Veen and
    Josef Angermeier and Dirk Koch and J{\"u}rgen Teich},
  title = {Dynamic Defragmentation of Reconfigurable Devices},
  journal = j-TRETS,
  volume = {5},
  number = {2},
  pages = {8:1--8:??},
  month = jun,
  year = {2012},
  coden = {????},
  doi = {https://doi.org/10.1145/2209285.2209287},
  issn = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l = {1936-7406},
  bibdate = {Tue Nov 6 18:07:43 MST 2012},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {We propose a new method for defragmenting the module
    layout of a reconfigurable device, enabled by a novel
    approach for dealing with communication needs between
    relocated modules and with inhomogeneities found in
    commonly used FPGAs. Our method is based on dynamic
    relocation of module positions during runtime, with
    only very little reconfiguration overhead; the
    objective is to maximize the length of contiguous free
    space that is available for new modules. We describe a
    number of algorithmic aspects of good defragmentation,
    and present an optimization method based on tabu
    search. Experimental results indicate that we can
    improve the quality of module layout by roughly 50\%
    over the static layout. Among other benefits, this
    improvement avoids unnecessary rejections of modules.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {8},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-url = {https://dl.acm.org/loi/trets},
}

%%% TRETS volume 5, number 2 (June 2012), article 9.
@article{Cheng:2012:STP,
  author = {Lerong Cheng and Wenyao Xu and Fang Gong and Yan Lin
    and Ho-Yan Wong and Lei He},
  title = {Statistical Timing and Power Optimization of
    Architecture and Device for {FPGAs}},
  journal = j-TRETS,
  volume = {5},
  number = {2},
  pages = {9:1--9:??},
  month = jun,
  year = {2012},
  coden = {????},
  doi = {https://doi.org/10.1145/2209285.2209288},
  issn = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l = {1936-7406},
  bibdate = {Tue Nov 6 18:07:43 MST 2012},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {Process variation in nanometer technology is becoming
    an important issue for cutting-edge FPGAs with a
    multimillion gate capacity. Considering both die-to-die
    and within-die variations in effective channel length,
    threshold voltage, and gate oxide thickness, we first
    develop closed-form models of chip-level FPGA leakage
    and timing variations. Experiments show that the mean
    and standard deviation computed by our models are
    within 3\% from those computed by Monte Carlo
    simulation. We also observe that the leakage and timing
    variations can be up to 3X and 1.9X, respectively. We
    then derive analytical yield models considering both
    leakage and timing variations, and use such models to
    evaluate the performance of FPGA device and
    architecture considering process variations. Compared
    to the baseline, which uses the VPR architecture and
    device setup based on the ITRS roadmap, device and
    architecture tuning improves leakage yield by 10.4\%,
    timing yield by 5.7\%, and leakage and timing combined
    yield by 9.4\%. We also observe that LUT size of 4
    gives the highest leakage yield, LUT size of 7 gives
    the highest timing yield, but LUT size of 5 achieves
    the maximum leakage and timing combined yield. To the
    best of our knowledge, this is the first in-depth study
    on FPGA architecture and device coevaluation
    considering process variation.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {9},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-url = {https://dl.acm.org/loi/trets},
}

%%% TRETS volume 5, number 2 (June 2012), article 10.
@article{Martin:2012:CPA,
  author = {Kevin Martin and Christophe Wolinski and Krzysztof
    Kuchcinski and Antoine Floch and Fran{\c{c}}ois
    Charot},
  title = {Constraint Programming Approach to Reconfigurable
    Processor Extension Generation and Application
    Compilation},
  journal = j-TRETS,
  volume = {5},
  number = {2},
  pages = {10:1--10:??},
  month = jun,
  year = {2012},
  coden = {????},
  doi = {https://doi.org/10.1145/2209285.2209289},
  issn = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l = {1936-7406},
  bibdate = {Tue Nov 6 18:07:43 MST 2012},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {In this article, we present a constraint programming
    approach for solving hard design problems present when
    automatically designing specialized processor
    extensions. Specifically, we discuss our approach for
    automatic selection and synthesis of processor
    extensions as well as efficient application compilation
    for these newly generated extensions. The discussed
    approach is implemented in our integrated design
    framework, IFPEC, built using Constraint Programming
    (CP). In our framework, custom instructions,
    implemented as processor extensions, are defined as
    computational patterns and represented as graphs. This,
    along with the graph representation of an application,
    provides a way to use our CP framework equipped with
    subgraph isomorphism and connected component
    constraints for identification of processor extensions
    as well as their selection, application scheduling,
    binding, and routing. All design steps assume
    architectures composed of runtime reconfigurable cells,
    implementing selected extensions, tightly connected to
    a processor. An advantage of our approach is the
    possibility of combining different heterogeneous
    constraints to represent and solve all our design
    problems. Moreover, the flexibility and expressiveness
    of the CP framework makes it possible to solve
    simultaneously extension selection, application
    scheduling, and binding and improve the quality of the
    generated results. The article is largely illustrated
    with experimental results.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {10},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-url = {https://dl.acm.org/loi/trets},
}

%%% TRETS volume 5, number 3 (October 2012), article 11.
%%% Special-issue introduction; the entry carries no abstract field.
@article{Hubner:2012:ISI,
  author = {Michael H{\"u}bner},
  title = {Introduction to the Special Issue on {ReCoSoC 2011}},
  journal = j-TRETS,
  volume = {5},
  number = {3},
  pages = {11:1--11:??},
  month = oct,
  year = {2012},
  coden = {????},
  doi = {https://doi.org/10.1145/2362374.2362375},
  issn = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l = {1936-7406},
  bibdate = {Tue Nov 6 18:07:44 MST 2012},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {11},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-url = {https://dl.acm.org/loi/trets},
}

%%% TRETS volume 5, number 3 (October 2012), article 12.
@article{Shield:2012:ACC,
  author = {John Shield and Jean-Philippe Diguet and Guy Gogniat},
  title = {Asymmetric Cache Coherency: Policy Modifications to
    Improve Multicore Performance},
  journal = j-TRETS,
  volume = {5},
  number = {3},
  pages = {12:1--12:??},
  month = oct,
  year = {2012},
  coden = {????},
  doi = {https://doi.org/10.1145/2362374.2362376},
  issn = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l = {1936-7406},
  bibdate = {Tue Nov 6 18:07:44 MST 2012},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {Asymmetric coherency is a new optimization method for
    coherency policies to support nonuniform workloads in
    multicore processors. Asymmetric coherency assists in
    load balancing a workload and this is applicable to SoC
    multicores where the applications are not evenly spread
    among the processors and customization of the coherency
    is possible. Asymmetric coherency is a policy change,
    and consequently our designs require little or no
    additional hardware over an existing system. We explore
    two different types of asymmetric coherency policies.
    Our bus-based asymmetric coherency policy, generated a
    60\% coherency cost reduction (reduction of latencies
    due to coherency messages) for nonshared data. Our
    directory-based asymmetric coherency policy, showed up
    to a 5.8\% execution time improvement and up to a 22\%
    improvement in average memory latency for the parallel
    benchmarks Sha, using a statically allocated asymmetry.
    Dynamically allocated asymmetry was found to generate
    further improvements in access latency, increasing the
    effectiveness of asymmetric coherency by up to 73.8\%
    when compared to the static asymmetric solution.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {12},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-url = {https://dl.acm.org/loi/trets},
}

%%% TRETS volume 5, number 3 (October 2012), article 13.
@article{Thielmann:2012:MLH,
  author = {Benjamin Thielmann and Jens Huthmann and Andreas
    Koch},
  title = {Memory Latency Hiding by Load Value Speculation for
    Reconfigurable Computers},
  journal = j-TRETS,
  volume = {5},
  number = {3},
  pages = {13:1--13:??},
  month = oct,
  year = {2012},
  coden = {????},
  doi = {https://doi.org/10.1145/2362374.2362377},
  issn = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l = {1936-7406},
  bibdate = {Tue Nov 6 18:07:44 MST 2012},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {Load value speculation has long been proposed as a
    method to hide the latency of memory accesses. It has
    seen very limited use in actual processors, often due
    to the high overhead of reexecuting misspeculated
    computations. We present PreCoRe, a framework capable
    of generating application-specific microarchitectures
    supporting load value speculation on reconfigurable
    computers. The article examines the lightweight
    speculation and replay mechanisms, the architecture of
    the actual data value prediction units as well as the
    impact on the nonspeculative parts of the memory
    system. In experiments, using PreCoRe has achieved
    speedups of up to 2.48 times over nonspeculative
    implementations.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {13},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-url = {https://dl.acm.org/loi/trets},
}

%%% TRETS volume 5, number 3 (October 2012), article 14.
%%% The compound surname of the last author is braced as {Le Rhun}:
%%% in First-Last form BibTeX would otherwise take only the final
%%% word as the last name and fold "Le" into the given names.
@Article{Gantel:2012:ERP,
  author =       "Laurent Gantel and Amel Khiar and Benoit Miramond and
                 Mohamed El Amine Benkhelifa and Lounis Kessal and
                 Fabrice Lemonnier and Jimmy {Le Rhun}",
  title =        "Enhancing Reconfigurable Platforms Programmability for
                 Synchronous Data-Flow Applications",
  journal =      j-TRETS,
  volume =       "5",
  number =       "3",
  pages =        "14:1--14:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2362374.2362378",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 6 18:07:44 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Recent FPGAs allow the design of efficient and complex
                 Heterogeneous Systems-on-Chip (HSoC). Namely, these
                 systems are composed of several processors, hardware
                 accelerators as well as communication media between all
                 these components. Performances provided by HSoCs make
                 them really interesting for data-flow applications,
                 especially image processing applications. The use of
                 this kind of architecture provides good performances
                 but the drawback is an increase of the programming
                 complexity. This complexity is due to the heterogeneous
                 deployment of the application on the platform. Some
                 functions are implemented in software to run on a
                 processor, whereas other functions are implemented in
                 hardware to run in a reconfigurable partition of the
                 FPGA. This article aims to define a programming model
                 based on the Synchronous Data-Flow model, in order to
                 abstract the heterogeneity of the implementation and to
                 leverage the communication issue between software and
                 hardware actors.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

%%% TRETS volume 5, number 3 (October 2012), article 15.
@article{Lusala:2012:STB,
  author = {Angelo Kuti Lusala and Jean-Didier Legat},
  title = {A {SDM--TDM}-Based Circuit-Switched Router for On-Chip
    Networks},
  journal = j-TRETS,
  volume = {5},
  number = {3},
  pages = {15:1--15:??},
  month = oct,
  year = {2012},
  coden = {????},
  doi = {https://doi.org/10.1145/2362374.2362379},
  issn = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l = {1936-7406},
  bibdate = {Tue Nov 6 18:07:44 MST 2012},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {This article proposes a circuit-switched router that
    combines Spatial Division Multiplexing (SDM) and Time
    Division Multiplexing (TDM) in order to increase path
    diversity in the router while sharing channels among
    multiple connections. In this way, the probability of
    establishing paths through the network is increased,
    thereby significantly reducing contention in the
    network. Furthermore, Quality of Service (QoS) is
    easily guaranteed. The proposed router was synthesized
    on an Stratix III 3SL340F FPGA device. A 4 $ \times $ 4
    2D Mesh SDM-TDM Network-on-Chip (NoC) was built with
    the proposed router and synthesized on the 3SL340F FPGA
    device. The 4 $ \times $ 4 2D Mesh SDM-TDM NoC was used
    to build on an FPGA device, a Multiprocessor
    System-on-Chip (MPSoC) platform consisted of 16 Nios
    II/f processors, 16 20-KB On-chip Memories, and 16
    Network Interfaces. Synthesis results of the MPSoC
    platform show that the proposed router architecture can
    be used to built large practicable MPSoC platforms with
    the proposed NoC architecture with a reasonable
    hardware overhead and appreciable clock frequency.
    Simulation results show that combining SDM and TDM
    techniques in a router allows the highest probability
    of establishing paths through the network.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {15},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-url = {https://dl.acm.org/loi/trets},
}

%%% TRETS volume 5, number 3 (October 2012), article 16.
@article{Gaspar:2012:SEF,
  author = {Lubos Gaspar and Viktor Fischer and Lilian Bossuet and
    Robert Fouquet},
  title = {Secure Extension of {FPGA} General Purpose Processors
    for Symmetric Key Cryptography with Partial
    Reconfiguration Capabilities},
  journal = j-TRETS,
  volume = {5},
  number = {3},
  pages = {16:1--16:??},
  month = oct,
  year = {2012},
  coden = {????},
  doi = {https://doi.org/10.1145/2362374.2362380},
  issn = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l = {1936-7406},
  bibdate = {Tue Nov 6 18:07:44 MST 2012},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {In data security systems, general purpose processors
    (GPPs) are often extended by a cryptographic
    accelerator. The article presents three ways of
    extending GPPs for symmetric key cryptography
    applications. Proposed extensions guarantee secure key
    storage and management even if the system is facing
    protocol, software and cache memory attacks. The system
    is partitioned into processor, cipher, and key memory
    zones. The three security zones are separated at
    protocol, system, architecture and physical levels. The
    proposed principle was validated on Altera NIOS II,
    Xilinx MicroBlaze and Microsemi Cortex M1 soft-core
    processor extensions. We show that stringent separation
    of the cipher zone is helpful for partial
    reconfiguration of the security module, if the
    enciphering algorithm needs to be dynamically changed.
    However, the key zone including reconfiguration
    controller must remain static in order to maintain the
    high level of security required. We demonstrate that
    the principle is feasible in partially reconfigurable
    field programmable gate arrays (FPGAs) such as Altera
    Stratix V or Xilinx Virtex 6 and also to some extent in
    FPGAs featuring hardwired general purpose processors
    such as Cortex M3 in Microsemi SmartFusion FPGA.
    Although the three GPPs feature different data
    interfaces, we show that the processors with their
    extensions reach the required high security level while
    maintaining partial reconfiguration capability.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {16},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-url = {https://dl.acm.org/loi/trets},
}

%%% TRETS volume 5, number 3 (October 2012), article 17.
@article{Ost:2012:EAT,
  author = {Luciano Ost and Sameer Varyani and Leandro Soares
    Indrusiak and Marcelo Mandelli and Gabriel Marchesan
    Almeida and Eduardo Wachter and Fernando Moraes and
    Gilles Sassatelli},
  title = {Enabling Adaptive Techniques in Heterogeneous {MPSoCs}
    Based on Virtualization},
  journal = j-TRETS,
  volume = {5},
  number = {3},
  pages = {17:1--17:??},
  month = oct,
  year = {2012},
  coden = {????},
  doi = {https://doi.org/10.1145/2362374.2362381},
  issn = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l = {1936-7406},
  bibdate = {Tue Nov 6 18:07:44 MST 2012},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {This article explores the use of virtualization to
    enable mechanisms like task migration and dynamic
    mapping in heterogeneous MPSoCs, thereby targeting the
    design of systems capable of adapt their behavior to
    time-changing workloads. Because tasks may have to be
    mapped to target processors with different instruction
    set architectures, we propose the use of Low Level
    Virtual Machine (LLVM) to postcompile the tasks at
    runtime depending on their target processor. A novel
    dynamic mapping heuristic is also proposed, aiming to
    exploit the advantages of specialized processors while
    taking into account the overheads imposed by
    virtualization. Extensive experimental work at
    different levels of abstraction---FPGA prototype, RTL
    and system-level simulation---is presented to evaluate
    the proposed techniques.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {17},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-url = {https://dl.acm.org/loi/trets},
}

%%% TRETS volume 5, number 3 (October 2012), article 18.
@article{Morgan:2012:RFL,
  author = {Fearghal Morgan and Seamus Cawley and David Newell},
  title = {Remote {FPGA} Lab for Enhancing Learning of Digital
    Systems},
  journal = j-TRETS,
  volume = {5},
  number = {3},
  pages = {18:1--18:??},
  month = oct,
  year = {2012},
  coden = {????},
  doi = {https://doi.org/10.1145/2362374.2362382},
  issn = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l = {1936-7406},
  bibdate = {Tue Nov 6 18:07:44 MST 2012},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {Learning in digital systems can be enhanced through
    applying a learn-by-doing approach on practical
    hardware systems and by using Web-based technology to
    visualize and animate hardware behavior. The authors
    have reported the Web-based Remote FPGA Lab (RFL) which
    provides a novel, real-time control and visualization
    interface to a remote, always-on FPGA hardware
    implementation. The RFL helps students to understand
    and reason about digital systems operation, using
    interactive animation of signal behavior in an
    executing digital logic system, at any level of the
    design hierarchy. The RFL supports the creation of
    real-time interactive digital systems teaching demos.
    The article presents student RFL usage data and survey
    data which highlight improved student engagement,
    learning and achievement. The article describes the RFL
    architecture, communication interface, Web page
    functionality, user access administration and database
    management. The article also describes the RFLGen
    program, developed to automate user design integration
    into the Xilinx ISE VHDL-based RFL project wrapper for
    creation of FPGA configuration bitstreams and RFL
    animations.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {18},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-url = {https://dl.acm.org/loi/trets},
}

@Article{Krieg:2012:PMP,
  author =       "Armin Krieg and Johannes Grinschgl and Christian
                 Steger and Reinhold Weiss and Holger Bock and Josef
                 Haid",
  title =        "{POWER-MODES: POWer-EmulatoR- and MOdel-Based
                 DEpendability and Security Evaluations}",
  journal =      j-TRETS,
  volume =       "5",
  number =       "4",
  pages =        "19:1--19:??",
  month =        dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2392616.2392617",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sun May 5 09:22:43 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Innovation cycles have been shortening significantly
                 during the last years. This process puts tremendous
                 pressure on designers of embedded systems for
                 security- or reliability-critical applications. Eventual
                 design problems not detected during design time can
                 lead to lost money, confidentiality, or even loss of
                 life in extreme cases. Therefore it is of vital
                 importance to evaluate a new system for its robustness
                 against intentionally and random induced operational
                 faults. Currently this is generally done using
                 extensive simulation runs using gate-level models or
                 direct measurements on the finished silicon product.
                 These approaches either need a significant amount of
                 time and computational power for these simulations or
                 rely on existing product samples. This article presents
                 a novel system evaluation platform using power
                 emulation and fault injection techniques to provide an
                 additional tool for developers of embedded systems in
                 security- and reliability-critical fields. Faults are
                 emulated using state-of-the-art fault injection methods
                 and a flexible pattern representation approach. The
                 resulting effects of these faults on the power
                 consumption profile are estimated using
                 state-of-the-art power emulation hardware. A modular
                 system augmentation approach provides emulation
                 flexibility similar to fault simulation
                 implementations. The platform enables the efficient
                 evaluation of new hardware or software implementations
                 of critical security or reliability solutions at an
                 early development phase.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Nabina:2012:AVS,
  author =       "Atukem Nabina and Jose Luis Nunez-Yanez",
  title =        "Adaptive Voltage Scaling in a Dynamically
                 Reconfigurable {FPGA}-Based Platform",
  journal =      j-TRETS,
  volume =       "5",
  number =       "4",
  pages =        "20:1--20:??",
  month =        dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2392616.2392618",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sun May 5 09:22:43 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Power is an important issue limiting the applicability
                 of Field Programmable Gate Arrays (FPGAs) since it is
                 considered to be up to one order of magnitude higher
                 than in ASICs. Recently, dynamic reconfiguration in
                 FPGAs has emerged as a viable technique able to achieve
                 power and cost reductions by time-multiplexing the
                 required functionality at runtime. In this article, the
                 applicability of Adaptive Voltage Scaling (AVS) to
                 FPGAs is considered together with dynamic
                 reconfiguration of logic and clock management resources
                 to further improve the power profile of these devices.
                 AVS is a popular power-saving technique in ASICs that
                 enables a device to regulate its own voltage and
                 frequency based on workload, fabrication, and operating
                 conditions. The resulting processing platform exploits
                 the available application-dependent timing margins to
                 achieve a power reduction up to 85\% operating at 0.58
                 volts compared with operating at a nominal voltage of 1
                 volt. The results also show that the energy
                 requirements at 0.58 volts are approximately five times
                 lower compared with nominal voltage and this can be
                 explained by the approximate cubic relation of static
                 energy with voltage and the fact that the static
                 component dominates power consumption in the considered
                 FPGA devices.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "20",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Jacobs:2012:RFT,
  author =       "Adam Jacobs and Grzegorz Cieslewski and Alan D. George
                 and Ann Gordon-Ross and Herman Lam",
  title =        "Reconfigurable Fault Tolerance: a Comprehensive
                 Framework for Reliable and Adaptive {FPGA}-Based Space
                 Computing",
  journal =      j-TRETS,
  volume =       "5",
  number =       "4",
  pages =        "21:1--21:??",
  month =        dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2392616.2392619",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sun May 5 09:22:43 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Commercial SRAM-based, field-programmable gate arrays
                 (FPGAs) have the potential to provide space
                 applications with the necessary performance to meet
                 next-generation mission requirements. However,
                 mitigating an FPGA's susceptibility to single-event
                 upset (SEU) radiation is challenging. Triple-modular
                 redundancy (TMR) techniques are traditionally used to
                 mitigate radiation effects, but TMR incurs substantial
                 overheads such as increased area and power
                 requirements. In order to reduce these overheads while
                 still providing sufficient radiation mitigation, we
                 propose a reconfigurable fault tolerance (RFT)
                 framework that enables system designers to dynamically
                 adjust a system's level of redundancy and fault
                 mitigation based on the varying radiation incurred at
                 different orbital positions. This framework includes an
                 adaptive hardware architecture that leverages FPGA
                 reconfigurable techniques to enable significant
                 processing to be performed efficiently and reliably
                 when environmental factors permit. To accurately
                 estimate upset rates, we propose an upset rate modeling
                 tool that captures time-varying radiation effects for
                 arbitrary satellite orbits using a collection of
                 existing, publicly available tools and models. We
                 perform fault-injection testing on a prototype RFT
                 platform to validate the RFT architecture and RFT
                 performability models. We combine our RFT hardware
                 architecture and the modeled upset rates using
                 phased-mission Markov modeling to estimate
                 performability gains achievable using our framework for
                 two case-study orbits.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "21",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Cancare:2012:EHC,
  author =       "Fabio Cancare and Davide B. Bartolini and Matteo
                 Carminati and Donatella Sciuto and Marco D.
                 Santambrogio",
  title =        "On the Evolution of Hardware Circuits via
                 Reconfigurable Architectures",
  journal =      j-TRETS,
  volume =       "5",
  number =       "4",
  pages =        "22:1--22:??",
  month =        dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2392616.2392620",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sun May 5 09:22:43 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Traditionally, hardware circuits are realized
                 according to techniques that follow the classical
                 phases of design and testing. A completely new approach
                 in the creation of hardware circuits has been
                 proposed---the Evolvable Hardware (EHW) paradigm, which
                 bases the circuit synthesis on a goal-oriented
                 evolutionary process inspired by biological evolution
                 in Nature. FPGA-based approaches have emerged as the
                 main architectural solution to implement EHW systems.
                 Various EHW systems have been proposed by researchers
                 but most of them, being based on outdated chips, do not
                 take advantage of the interesting features introduced
                 in newer FPGAs. This article describes a project named
                 Hardware Evolution over Reconfigurable Architectures
                 (HERA), which aims at creating a complete and
                 performance-oriented framework for the evolution of
                 digital circuits, leveraging the reconfiguration
                 technology available in FPGAs. The project is described
                 from its birth to its current state, presenting its
                 evolutionary technique tailored for FPGA-based circuits
                 and the most recent enhancements to improve the
                 scalability with respect to problem size. The developed
                 EHW system outperforms the state of the art, proving
                 its effectiveness in evolving both standard benchmarks
                 and more complex real-world applications.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "22",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Ould-Bachir:2013:SAS,
  author =       "Tarek Ould-Bachir and Jean Pierre David",
  title =        "Self-Alignment Schemes for the Implementation of
                 Addition-Related Floating-Point Operators",
  journal =      j-TRETS,
  volume =       "6",
  number =       "1",
  pages =        "1:1--1:??",
  month =        may,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2457443.2457444",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Mar 13 08:09:42 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Advances in semiconductor technology brings to the
                 market incredibly dense devices, capable of handling
                 tens to hundreds floating-point operators on a single
                 chip; so do the latest field programmable gate arrays
                 (FPGAs). In order to alleviate the complexity of
                 resorting to these devices in computationally intensive
                 applications, this article proposes hardware schemes
                 for the realization of addition-related floating-point
                 operators based on the self-alignment technique (SAT).
                 The article demonstrates that the schemes guarantee an
                 accuracy as if summation was computed accurately in the
                 precision of operator's internal mantissa, then
                 faithfully rounded to working precision. To achieve
                 such performance, the article adopts the redundant high
                 radix carry-save (HRCS) format for the rapid addition
                 of wide mantissas. Implementation results show that
                 combining the SAT and the HRCS format allows the
                 implementation of complex operators with reduced area
                 and latency, more so when a fused-path approach is
                 adopted. The article also proposes a new hardware
                 operator for performing endomorphic HRCS additions and
                 presents a new technique for speeding up the conversion
                 from the redundant HRCS to a conventional binary
                 format.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Zhang:2013:FBA,
  author =       "Yan Zhang and Fan Zhang and Zheming Jin and Jason D.
                 Bakos",
  title =        "An {FPGA}-Based Accelerator for Frequent Itemset
                 Mining",
  journal =      j-TRETS,
  volume =       "6",
  number =       "1",
  pages =        "2:1--2:??",
  month =        may,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2457443.2457445",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Mar 13 08:09:42 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "In this article we describe a Field Programmable Gate
                 Array (FPGA)-based coprocessor architecture for
                 Frequent Itemset Mining (FIM). FIM is a common data
                 mining task used to find frequently occurring subsets
                 amongst a database of sets. FIM is a nonnumerical, data
                 intensive computation and is used in machine learning
                 and computational biology. FIM is particularly
                 expensive---in terms of execution time and
                 memory---when performed on large and/or sparse
                 databases or when applied using a low appearance
                 frequency threshold. Because of this, the development
                 of increasingly efficient FIM algorithms and their
                 mapping to parallel architectures is an active field.
                 Previous attempts to accelerate FIM using FPGAs have
                 relied on performance-limiting strategies such as
                 iterative database loading and runtime logic unit
                 reconfiguration. In this article, we present a novel
                 architecture to implement Eclat, a well-known FIM
                 algorithm. Unlike previous efforts, our technique does
                 not impose limits on the maximum set size as a function
                 of available FPGA logic resources and our design scales
                 well to multiple FPGAs. In addition to a novel hardware
                 design, we also present a corresponding compression
                 scheme for intermediate results that are stored in
                 on-chip memory. On a four-FPGA board, experimental
                 results show up to 68X speedup compared to a highly
                 optimized software implementation.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Meeuws:2013:QSM,
  author =       "Roel Meeuws and S. Arash Ostadzadeh and Carlo Galuzzi
                 and Vlad Mihai Sima and Razvan Nane and Koen Bertels",
  title =        "{Quipu}: a Statistical Model for Predicting Hardware
                 Resources",
  journal =      j-TRETS,
  volume =       "6",
  number =       "1",
  pages =        "3:1--3:??",
  month =        may,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2457443.2457446",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Mar 13 08:09:42 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "There has been a steady increase in the utilization of
                 heterogeneous architectures to tackle the growing need
                 for computing performance and low-power systems. The
                 execution of computation-intensive functions on
                 specialized hardware enables to achieve substantial
                 speedups and power savings. However, with a large
                 legacy code base and software engineering experts, it
                 is not at all obvious how to easily utilize these new
                 architectures. As a result, there is a need for
                 comprehensive tool support to bridge the knowledge gap
                 of many engineers as well as to retarget legacy code.
                 In this article, we present the Quipu modeling
                 approach, which consists of a set of tools and a
                 modeling methodology that can generate hardware
                 estimation models, which provide valuable information
                 for developers. This information helps to focus their
                 efforts, to partition their application, and to select
                 the right heterogeneous components. We present Quipu's
                 capability to generate domain-specific models, that are
                 up to several times more accurate within their
                 particular domain (error: 4.6\%) as compared to
                 domain-agnostic models (error: 23\%). Finally, we show
                 how Quipu can generate models for a new toolchain and
                 platform within a few days.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{deDinechin:2013:FPE,
  author =       "Florent de Dinechin and Pedro Echeverr{\'\i}a and
                 Marisa L{\'o}pez-Vallejo and Bogdan Pasca",
  title =        "Floating-Point Exponentiation Units for Reconfigurable
                 Computing",
  journal =      j-TRETS,
  volume =       "6",
  number =       "1",
  pages =        "4:1--4:??",
  month =        may,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2457443.2457447",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Mar 13 08:09:42 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "The high performance and capacity of current FPGAs
                 makes them suitable as acceleration co-processors. This
                 article studies the implementation, for such
                 accelerators, of the floating-point power function $
                 x^y $ as defined by the C99 and IEEE 754-2008
                 standards, generalized here to arbitrary exponent and
                 mantissa sizes. Last-bit accuracy at the smallest
                 possible cost is obtained thanks to a careful study of
                 the various subcomponents: a floating-point logarithm,
                 a modified floating-point exponential, and a truncated
                 floating-point multiplier. A parameterized architecture
                 generator in the open-source FloPoCo project is
                 presented in details and evaluated.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Neely:2013:RTH,
  author =       "Christopher E. Neely and Gordon Brebner and Weijia
                 Shang",
  title =        "{ReShape}: Towards a High-Level Approach to Design and
                 Operation of Modular Reconfigurable Systems",
  journal =      j-TRETS,
  volume =       "6",
  number =       "1",
  pages =        "5:1--5:??",
  month =        may,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2457443.2457448",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Mar 13 08:09:42 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "The latest FPGA devices provide the headroom to
                 implement large-scale and complex systems. A key
                 requirement is the integration of modules from diverse
                 sources to promote modular design and reuse. A contrary
                 factor is that using dynamic partial reconfiguration
                 typically requires low-level planning of the system
                 implementation. In this article, we introduce ReShape:
                 a high-level approach for designing reconfigurable
                 systems by interconnecting modules, which gives a
                 ``plug and play'' look and feel, is supported by tools
                 that carry out implementation functions, and is carried
                 through to support system reconfiguration during
                 operation. The emphasis is on the inter-module
                 connections and abstracting the communication patterns
                 that are typical between modules: for example, the
                 streaming of data, or the reading and writing of data
                 to and from memory modules. The details of wiring and
                 signaling are hidden from view, via metadata associated
                 with individual modules. This setting allows system
                 reconfiguration at the module level, both by supporting
                 type checking of replacement modules and by managing
                 the overall system implementation, via metadata
                 associated with its FPGA floorplan. The methodology and
                 tools have been implemented in a prototype targeted to
                 a domain-specific setting---high-speed networking---and
                 have been validated on real telecommunications design
                 projects.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Goehringer:2013:ISS,
  author =       "Diana Goehringer and Ren{\'e} Cumplido",
  title =        "Introduction to the special section on {19th
                 Reconfigurable Architectures Workshop (RAW 2012)}",
  journal =      j-TRETS,
  volume =       "6",
  number =       "2",
  pages =        "6:1--6:??",
  month =        jul,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2499625.2499626",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Mar 13 08:09:43 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Sidiropoulos:2013:JFS,
  author =       "Harry Sidiropoulos and Kostas Siozios and Peter Figuli
                 and Dimitrios Soudris and Michael H{\"u}bner and
                 J{\"u}rgen Becker",
  title =        "{JITPR}: a framework for supporting fast application's
                 implementation onto {FPGAs}",
  journal =      j-TRETS,
  volume =       "6",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jul,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2492185",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Mar 13 08:09:43 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "The execution runtime usually is a headache for
                 designers performing application mapping onto
                 reconfigurable architectures. In this article we
                 propose a methodology, as well as the supporting
                 toolset, targeting to provide fast application
                 implementation onto reconfigurable architectures with
                 the usage of a Just-In-Time (JIT) compilation
                 framework. Experimental results prove the efficiency of
                 the introduced framework, as we reduce the execution
                 runtime compared to the state-of-the-art approach on
                 average by 53.5$ \times $. Additionally, the derived
                 solutions achieve higher operation frequencies by 1.17$
                 \times $, while they also exhibit significant lower
                 fragmentation ratios of hardware resources.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Heisswolf:2013:VND,
  author =       "Jan Heisswolf and Aurang Zaib and Andreas
                 Weichslgartner and Ralf K{\"o}nig and Thomas Wild and
                 J{\"u}rgen Teich and Andreas Herkersdorf and J{\"u}rgen
                 Becker",
  title =        "Virtual networks --- distributed communication
                 resource management",
  journal =      j-TRETS,
  volume =       "6",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jul,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2492186",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Mar 13 08:09:43 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Networks-on-Chip (NoC) enable scalability for future
                 manycore architectures, facilitating parallel
                 communication between multiple cores. Applications
                 running in parallel on a NoC-based architecture can
                 affect each other due to overlapping communication.
                 Quality-of-Service (QoS) must be supported by the
                 communication infrastructure to execute communication-,
                 real-time- and safety-critical applications on such an
                 architecture. Different strategies have been proposed
                 to provide QoS for point-to-point connections. These
                 strategies allow each node to set up a limited number
                 of connections to other nodes. In this work Virtual
                 Networks (VN) are proposed to enable QoS for regions of
                 a NoC-based architecture. Virtual Networks overcome the
                 limitation of point-to-point connections. A VN behaves
                 like an exclusive physical network. Virtual Networks
                 can be defined and configured during runtime. The size
                 of the VN region and the assigned bandwidth can be
                 adjusted depending on the application requirements.
                 Virtual Networks enable the decoupling of local from
                 global communication. Therefore, the communication of
                 the application mapped into the region is assigned to a
                 Virtual Network established in that specific region.
                 This concept targets packet-switched networks with
                 virtual channels and is realized by an intelligent
                 hardware unit that manages the virtual channel
                 reservation process at system runtime. Virtual Networks
                 can be established and administrated independent of
                 each other, enabling distributed communication resource
                 management. The proposed concept is implemented as a
                 cycle-accurate SystemC simulation model. The simulation
                 results of executing communicating graphs obtained from
                 real application highlight the usefulness of Virtual
                 Networks by showing improved throughput and reduced
                 delay in the respective scenarios. A hardware
                 implementation demonstrates a low impact on area
                 utilization and power consumption.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Ganegedara:2013:CPA,
  author =       "Thilan Ganegedara and Viktor Prasanna",
  title =        "A Comprehensive Performance Analysis of Virtual
                 Routers on {FPGA}",
  journal =      j-TRETS,
  volume =       "6",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jul,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2492187",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Mar 13 08:09:43 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "Network virtualization has gained much popularity with
                 the advent of datacenter networking. The hardware
                 aspect of network virtualization, router
                 virtualization, allows network service providers to
                 consolidate network hardware, reducing equipment cost
                 and management overhead. Several approaches have been
                 proposed to achieve router virtualization to support
                 several virtual networks on a single hardware platform.
                 However, their performance has not been analyzed
                 quantitatively to understand the benefits of each
                 approach. In this work, we perform a comprehensive
                 analysis of performance of these approaches on Field
                 Programmable Gate Array (FPGA) with respect to memory
                 consumption, throughput, and power consumption.
                 Generalized versions of virtualization approaches are
                 evaluated based on post place-and-route results on a
                 state-of-the-art FPGA. Grouping of routing tables is
                 proposed as a novel approach to improve scalability
                 (i.e., the number of virtual networks hosted on a
                 single chip) of virtual routers on FPGA with respect to
                 memory requirement. Further, we employ floor-planning
                 techniques to efficiently utilize chip resources and
                 achieve high performance for virtualized, pipelined
                 architectures, resulting in 1.6$ \times $ speedup on
                 the average compared with the non-floor-planned
                 approach. The results indicate that the proposed
                 solution is able to support 100+ and 50 virtual routers
                 per chip in the near-best and near-worst case
                 scenarios, while operating at 20+ Gbps rates.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Das:2013:TDA,
  author =       "Joydip Das and Steven J. E. Wilton",
  title =        "Towards Development of an Analytical Model Relating
                 {FPGA} Architecture Parameters to Routability",
  journal =      j-TRETS,
  volume =       "6",
  number =       "2",
  pages =        "10:1--10:??",
  month =        jul,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2499625.2499627",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Mar 13 08:09:43 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "We present an analytical model relating FPGA
                 architectural parameters to the routability of the
                 FPGA. The inputs to the model include the channel width
                 and the connection and the switch block flexibilities.
                 The output is an estimate of the proportion of nets in
                 a large circuit that can be expected to be successfully
                 routed on the FPGA. We assume that the circuit is
                 routed to the FPGA using a single-step combined
                 global/detailed router. We show that the model
                 correctly predicts routability trends. We also present
                 an example application to demonstrate that this model
                 may be a valuable tool for FPGA architects. When
                 combined with the earlier works on analytical modeling,
                 our model can be used to quickly predict the
                 routability without going through any stage of an
                 expensive CAD flow. We envisage that this model will
                 benefit FPGA architecture designers and vendors to
                 quickly evaluate FPGA routing fabrics.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Huang:2013:VHS,
  author =       "Chun-Hsian Huang and Pao-Ann Hsiung",
  title =        "Virtualizable Hardware\slash Software Design
                 Infrastructure for Dynamically Partially Reconfigurable
                 Systems",
  journal =      j-TRETS,
  volume =       "6",
  number =       "2",
  pages =        "11:1--11:??",
  month =        jul,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2499625.2499628",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Mar 13 08:09:43 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "In most existing works, reconfigurable hardware
                 modules are still managed as conventional hardware
                 devices. Further, the software reconfiguration overhead
                 incurred by loading corresponding device drivers into
                 the kernel of an operating system has been overlooked
                 until now. As a result, the enhancement of system
                 performance and the utilization of reconfigurable
                 hardware modules are still quite limited. This work
                 proposes a virtualizable hardware/software design
                 infrastructure (VDI) for dynamically partially
                 reconfigurable systems. Besides the gate-level hardware
                 virtualization provided by the partial reconfiguration
                 technology, VDI supports the device-level hardware
                 virtualization. In VDI, a reconfigurable hardware
                 module can be virtualized such that it can be accessed
                 efficiently by multiple applications in an interleaving
                 way. A Hot-Plugin Connector (HPC) replaces the
                 conventional device driver, such that it not only
                 assists the device-level hardware virtualization but
                 can also be reused across different hardware modules.
                 To facilitate hardware/software communication and to
                 enhance system scalability, the proposed VDI is
                 realized as a hierarchical design framework.
                 User-designed reconfigurable hardware modules can be
                 easily integrated into VDI, and are then executed as
                 hardware tasks in an operating system for
                 reconfigurable systems (OS4RS). A dynamically partially
                 reconfigurable network security system was designed
                 using VDI, which demonstrated a higher utilization of
                 reconfigurable hardware modules and a reduction by up
                 to 12.83\% of the processing time required by using the
                 conventional method in a dynamically partially
                 reconfigurable system.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@article{Liu:2013:INL,
  author          = {Hanyu Liu and Senthilkumar T. Rajavel and Ali Akoglu},
  title           = {Integration of Net-Length Factor with Timing- and
      Routability-Driven Clustering Algorithms},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
      Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  volume          = {6},
  number          = {3},
  pages           = {12:1--12:??},
  articleno       = {12},
  month           = oct,
  year            = {2013},
  doi             = {https://doi.org/10.1145/2517324},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  journal-url     = {https://dl.acm.org/loi/trets},
  bibdate         = {Thu Mar 13 08:09:45 MDT 2014},
  bibsource       = {http://portal.acm.org/;
      https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract        = {In FPGA CAD flow, the clustering stage builds the
      foundation for placement and routing stages and affects
      performance parameters, such as routability, delay, and
      channel width significantly. Net sharing and
      criticality are the two most commonly used factors in
      clustering cost functions. With this study, we first
      derive a third term, net-length factor, and then design
      a generic method for integrating net length into the
      clustering algorithms. Net-length factor enables
      characterizing the nets based on the routing stress
      they might cause during later stages of the CAD flow
      and is essential for enhancing the routability of the
      design. We evaluate the effectiveness of integrating
      net length as a factor into the well-known timing
      (T-VPack)-, depopulation (T-NDPack)-, and routability
      (iRAC and T-RPack)-driven clustering algorithms.
      Through exhaustive experimental studies, we show that
      net-length factor consistently helps improve the
      channel-width performance of routability-,
      depopulation-, and timing-driven clustering algorithms
      that do not explicitly target low fan-out nets in their
      cost functions. Particularly, net-length factor leads
      to average reduction in channel width for T-VPack,
      T-RPack, and T-NDPack by 11.6\%, 10.8\%, and 14.2\%,
      respectively, and in a majority of the cases, improves
      the critical-path delay without increasing the array
      size.},
}

@article{Mehta:2013:UGE,
  author          = {Gayatri Mehta and Carson Crawford and Xiaozhong Luo
      and Natalie Parde and Krunalkumar Patel and Brandon
      Rodgers and Anil Kumar Sistla and Anil Yadav and Marc
      Reisner},
  title           = {{UNTANGLED}: a Game Environment for Discovery of
      Creative Mapping Strategies},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
      Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  volume          = {6},
  number          = {3},
  pages           = {13:1--13:??},
  articleno       = {13},
  month           = oct,
  year            = {2013},
  doi             = {https://doi.org/10.1145/2517325},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  journal-url     = {https://dl.acm.org/loi/trets},
  bibdate         = {Thu Mar 13 08:09:45 MDT 2014},
  bibsource       = {http://portal.acm.org/;
      https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract        = {The problem of creating efficient mappings of dataflow
      graphs onto specific architectures (i.e., solving the
      place and route problem) is incredibly challenging. The
      difficulty is especially acute in the area of
      Coarse-Grained Reconfigurable Architectures (CGRAs) to
      the extent that solving the mapping problem may remove
      a significant bottleneck to adoption. We believe that
      the next generation of mapping algorithms will exhibit
      pattern recognition, the ability to learn from
      experience, and identification of creative solutions,
      all of which are human characteristics. This manuscript
      describes our game UNTANGLED, developed and fine-tuned
      over the course of a year to allow us to capture and
      analyze human mapping strategies. It also describes our
      results to date. We find that the mapping problem can
      be crowdsourced very effectively, that players can
      outperform existing algorithms, and that successful
      player strategies share many elements in common. Based
      on our observations and analysis, we make concrete
      recommendations for future research directions for
      mapping onto CGRAs.},
}

@Article{Hormigo:2013:SRC,
  author =       "Javier Hormigo and Gabriel Caffarena and Juan P.
                 Oliver and Eduardo Boemo",
  title =        "Self-Reconfigurable Constant Multiplier for {FPGA}",
  journal =      j-TRETS,
  volume =       "6",
  number =       "3",
  pages =        "14:1--14:??",
  month =        oct,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2490830",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Mar 13 08:09:45 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Constant multipliers are widely used in signal
                 processing applications to implement the multiplication
                 of signals by a constant coefficient. However, in some
                 applications, this coefficient remains invariable only
                 during an interval of time, and then, its value changes
                 to adapt to new circumstances. In this article, we
                 present a self-reconfigurable constant multiplier
                 suitable for LUT-based FPGAs able to reload the
                 constant in runtime. The pipelined architecture
                 presented is easily scalable to any multiplicand and
                 constant sizes, for unsigned and signed
                 representations. It can be reprogrammed in 16 clock
                 cycles, equivalent to less than 100 ns in current
                 FPGAs. This value is significantly smaller than FPGA
                 partial configuration times. The presented approach is
                 more efficient in terms of area and speed when compared
                 to generic multipliers, achieving up to 91\% area
                 reduction and up to 102\% speed improvement for the
                 case-study circuits tested. The power consumption of
                 the proposed multipliers is in the range of those of
                 slice-based multipliers provided by the vendor.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Gharibian:2013:ASL,
  author =       "Farnaz Gharibian and Lesley Shannon and Peter Jamieson
                 and Kevin Chung",
  title =        "Analyzing System-Level Information's Correlation to
                 {FPGA} Placement",
  journal =      j-TRETS,
  volume =       "6",
  number =       "3",
  pages =        "15:1--15:??",
  month =        oct,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2501985",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Mar 13 08:09:45 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "One popular placement algorithm for
                 Field-Programmable Gate Arrays (FPGAs) is called
                 Simulated Annealing (SA). This algorithm tries to
                 create a good quality placement from a flattened design
                 that no longer contains any high-level information
                 related to the original design hierarchy. Placement is
                 an NP-hard problem, and as the size and complexity of
                 designs implemented on FPGAs increases, SA does not
                 scale well to find good solutions in a timely fashion.
                 In this article, we investigate if system-level
                 information can be reconstructed from a flattened
                 netlist and evaluate how that information is realized
                 in terms of its locality in the final placement. If
                 there is a strong relationship between good quality
                 placements and system-level information, then it may be
                 possible to divide a large design into smaller
                 components and improve the time needed to create a good
                 quality placement. Our preliminary results suggest that
                 the locality property of the information embedded in
                 the system-level HDL structure (i.e., ``module'',
                 ``always'', and ``if'' statements) is greatly affected
                 by designer HDL coding style. Therefore, a
                 reconstructive algorithm, called Affinity Propagation,
                 is also considered as a possible method of generating a
                 meaningful coarse-grain picture of the design.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@article{Plavec:2013:ETD,
  author          = {Franjo Plavec and Zvonko Vranesic and Stephen Brown},
  title           = {Exploiting Task- and Data-Level Parallelism in
      Streaming Applications Implemented in {FPGAs}},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
      Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  volume          = {6},
  number          = {4},
  pages           = {16:1--16:??},
  articleno       = {16},
  month           = dec,
  year            = {2013},
  doi             = {https://doi.org/10.1145/2535932},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  journal-url     = {https://dl.acm.org/loi/trets},
  bibdate         = {Thu Mar 13 08:09:46 MDT 2014},
  bibsource       = {http://portal.acm.org/;
      https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract        = {This article describes the design and implementation
      of a novel compilation flow that implements circuits in
      FPGAs from a streaming programming language. The
      streaming language supported is called FPGA Brook and
      is based on the existing Brook language. It allows
      system designers to express applications in a way that
      exposes parallelism, which can be exploited through
      hardware implementation. FPGA Brook supports
      replication, allowing parts of an application to be
      implemented as multiple hardware units operating in
      parallel. Hardware units are interconnected through
      FIFO buffers which use the small memory modules
      available in FPGAs. The FPGA Brook automated design
      flow uses a source-to-source compiler, developed as a
      part of this work, and combines it with a commercial
      behavioral synthesis tool to generate the hardware
      implementation. A suite of benchmark applications was
      developed in FPGA Brook and implemented using our
      design flow. Experimental results indicate that
      performance of many applications scales well with
      replication. Our benchmark applications also achieve
      significantly better results than corresponding
      implementations using a commercial behavioral synthesis
      tool. We conclude that using an automated design flow
      for implementation of streaming applications in FPGAs
      is a promising methodology.},
}

@article{Ananthan:2013:RPH,
  author          = {T. Ananthan and M. V. Vaidyan},
  title           = {A Reconfigurable Parallel Hardware Implementation of
      the Self-Tuning Regulator},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
      Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  volume          = {6},
  number          = {4},
  pages           = {17:1--17:??},
  articleno       = {17},
  month           = dec,
  year            = {2013},
  doi             = {https://doi.org/10.1145/2535934},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  journal-url     = {https://dl.acm.org/loi/trets},
  bibdate         = {Thu Mar 13 08:09:46 MDT 2014},
  bibsource       = {http://portal.acm.org/;
      https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract        = {The self-tuning regulator (STR) is a popular adaptive
      control algorithm. A high-performance computer is
      required for its implementation due to the heavy online
      computational burden. To extend STR for more real-time
      applications, a parallel hardware implementation on a
      low-cost reconfigurable computer is presented. The
      hardware was incorporated with multistage matrix
      multiplication (MMM) and trace technique to enhance the
      processing speed. This design was deeply pipelined to
      achieve high throughput. The algorithm was prototyped
      on a Xilinx field-programmable gate array (FPGA) device
      with a maximum operating frequency of 210.436 MHz.
      Application-specific integrated circuit (ASIC)
      implementation of STR was reported.},
}

@article{Leow:2013:AME,
  author          = {Yoon Kah Leow and Ali Akoglu and Susan Lysecky},
  title           = {An Analytical Model for Evaluating Static Power of
      Homogeneous {FPGA} Architectures},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
      Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  volume          = {6},
  number          = {4},
  pages           = {18:1--18:??},
  articleno       = {18},
  month           = dec,
  year            = {2013},
  doi             = {https://doi.org/10.1145/2535935},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  journal-url     = {https://dl.acm.org/loi/trets},
  bibdate         = {Thu Mar 13 08:09:46 MDT 2014},
  bibsource       = {http://portal.acm.org/;
      https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract        = {As capacity of the field-programmable gate arrays
      (FPGAs) continues to increase, power dissipated in the
      logic and routing resources has become a critical
      concern for FPGA architects. Recent studies have shown
      that static power is fast approaching the dynamic power
      in submicron devices. In this article, we propose an
      analytical model for relating homogeneous
      island-style-based FPGA architecture to static power.
      Current FPGA power models are tightly coupled with CAD
      tools. Our CAD-independent model captures the static
      power for a given FPGA architecture based on estimates
      of routing and logic resource utilizations from a
      pre-technology mapped netlist. We observe an average
      correlation ratio (C-Ratio) of 95\% and a minimum
      absolute percentage error (MAPE) rate of 15\% with
      respect to the experimental results generated by the
      Versatile Placement Routing (VPR) tool over the MCNC
      benchmarks. Our model offers application engineers and
      FPGA architects the capability to evaluate the impact
      of their design choices on static power without having
      to go through CAD-intensive investigations.},
}

@article{Ben-Asher:2013:OWS,
  author          = {Yosi Ben-Asher and Ron Meldiner and Nadav Rotem},
  title           = {Optimizing Wait States in the Synthesis of Memory
      References with Unpredictable Latencies},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
      Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  volume          = {6},
  number          = {4},
  pages           = {19:1--19:??},
  articleno       = {19},
  month           = dec,
  year            = {2013},
  doi             = {https://doi.org/10.1145/2535936},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  journal-url     = {https://dl.acm.org/loi/trets},
  bibdate         = {Thu Mar 13 08:09:46 MDT 2014},
  bibsource       = {http://portal.acm.org/;
      https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract        = {We consider the problem of synthesizing circuits (from
      C to Verilog) that are optimized to handle
      unpredictable latencies of memory operations.
      Unpredictable memory latencies can occur due to the use
      of on chip caches, DRAM memory modules, buffers/queues,
      or multiport memories. Typically, high-level synthesis
      compilers assume fixed and known memory latencies, and
      thus are able to schedule the code's operations
      efficiently. The operations in the source code are
      scheduled into states of a state machine whose states
      will be synthesized to Verilog. The goal is to minimize
      scheduling length by maximizing the number of
      operations (and in particular memory operations) that
      are executed in parallel at the same state. However,
      with unpredictable latencies, there can be an
      exponential number of possible orders in which these
      parallel memory operations can terminate. Thus, in
      order to minimize the scheduling, we need a different
      schedule for any such order. This is not practical, and
      we show a technique of synthesizing a compact state
      machine that schedules only a small subset of these
      possible termination orders. Our results show that this
      compact state machine can improve the execution time
      compared to a regular scheduling that waits for the
      termination of all the active memory references in
      every state.},
}

@article{Kornaros:2014:DPT,
  author          = {George Kornaros and Dionisios Pnevmatikatos},
  title           = {Dynamic Power and Thermal Management of {NoC-Based}
      Heterogeneous {MPSoCs}},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
      Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  volume          = {7},
  number          = {1},
  pages           = {1:1--1:??},
  articleno       = {1},
  month           = feb,
  year            = {2014},
  doi             = {https://doi.org/10.1145/2567658},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  journal-url     = {https://dl.acm.org/loi/trets},
  bibdate         = {Thu Mar 13 08:09:47 MDT 2014},
  bibsource       = {http://portal.acm.org/;
      https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Advances in silicon process technology have made it
      possible to include multiple processor cores on a
      single die. Billion transistor architectures usually in
      the form of networks-on-chip present a wide range of
      challenges in design, microarchitecture, and
      algorithmic levels with significant impact to system
      performance and power consumption. In this article, we
      propose efficient methods and mechanisms that exploit a
      heterogeneous network-on-chip (NoC) to achieve a power-
      and thermal-aware coherent system. To this end, we
      utilize different management techniques which employ
      dynamic frequency scaling circuitry and power and
      temperature sensors per node to achieve real-time
      workload prediction and allocation at node and system
      level by low-cost threads. The developed heterogeneous
      multicoprocessing infrastructure is utilized to
      evaluate diverse policies for power-aware computing in
      terms of effectiveness and in relation to distributed
      sensor-conscious management. The proposed
      reconfigurable architecture supports coprocessor
      accelerators per node, monitors the program's power
      profile on-the-fly, and balances power and thermal
      behavior at the NoC level. Overall, these techniques
      form a system exploration methodology using a
      multi-FPGA emulation platform showing a minimum
      complexity overhead.},
}

@Article{Iskander:2014:HLA,
  author =       "Yousef Iskander and Cameron Patterson and Stephen
                 Craven",
  title =        "High-Level Abstractions and Modular Debugging for
                 {FPGA} Design Validation",
  journal =      j-TRETS,
  volume =       "7",
  number =       "1",
  pages =        "2:1--2:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2567662",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Mar 13 08:09:47 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Design validation is the most time-consuming task in
                 the FPGA design cycle. Although manufacturers and
                 third-party vendors offer a range of tools that provide
                 visibility and control of the different stages of a
                 design, many require that the design be fully
                 re-implemented for even simple parameter modifications
                 or do not allow the design to be run at full speed.
                 Designs are typically first modeled using a high-level
                 language then later rewritten in a hardware description
                 language, first for simulation and then later modified
                 for synthesis. IP and third-party cores may differ
                 during these final two stages complicating development
                 and validation. The developed approach provides two
                 means of directly validating synthesized hardware
                 designs. The first allows the original high-level model
                 written in C or C++ to be directly coupled to the
                 synthesized hardware, abstracting away the traditional
                 gate-level view of designs. A high-level programmatic
                 interface allows the synthesized design to be validated
                 directly by the software reference model. The second
                 approach provides an alternative view to FPGAs within
                 the scope of a traditional software debugger. This
                 debug framework leverages partially reconfigurable
                 regions to accelerate the modification of dynamic,
                 software-like breakpoints for low-level analysis and
                 provides an automatable, scriptable, command-line
                 interface directly to a running design on an FPGA.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Jin:2014:FAS,
  author =       "Minxi Jin and Tsutomu Maruyama",
  title =        "Fast and Accurate Stereo Vision System on {FPGA}",
  journal =      j-TRETS,
  volume =       "7",
  number =       "1",
  pages =        "3:1--3:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2567659",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Mar 13 08:09:47 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "In this article, we present a fast and high quality
                 stereo matching algorithm on FPGA using cost
                 aggregation (CA) and fast locally consistent (FLC)
                 dense stereo. In many software programs, global
                 matching algorithms are used in order to obtain
                 accurate disparity maps. Although their error rates are
                 considerably low, their processing speeds are far from
                 that required for real-time processing because of their
                 complex processing sequences. In order to realize
                 real-time processing, many hardware systems have been
                 proposed to date. They have achieved considerably high
                 processing speeds; however, their error rates are not
                 as good as those of software programs, because simple
                 local matching algorithms have been widely used in
                 those systems. In our system, sophisticated local
                 matching algorithms (CA and FLC) that are suitable for
                 FPGA implementation are used to achieve low error rate
                 while maintaining the high processing speed. We
                 evaluate the performance of our circuit on Xilinx
                 Virtex-6 FPGAs. Its error rate is comparable to that of
                 top-level software algorithms, and its processing speed
                 is nearly 2 clock cycles per pixel, which reaches 507.9
                 fps for 640 $ \times $ 480 pixel images.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Ulusel:2014:FDE,
  author =       "Onur Ulusel and Kumud Nepal and R. Iris Bahar and
                 Sherief Reda",
  title =        "Fast Design Exploration for Performance, Power and
                 Accuracy Tradeoffs in {FPGA-Based} Accelerators",
  journal =      j-TRETS,
  volume =       "7",
  number =       "1",
  pages =        "4:1--4:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2567661",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Mar 13 08:09:47 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "The ease-of-use and reconfigurability of FPGAs makes
                 them an attractive platform for accelerating
                 algorithms. However, accelerating becomes a challenging
                 task as the large number of possible design parameters
                 lead to different accelerator variants. In this
                 article, we propose techniques for fast design
                 exploration and multi-objective optimization to quickly
                 identify both algorithmic and hardware parameters that
                 optimize these accelerators. This information is used
                 to run regression analysis and train mathematical
                 models within a nonlinear optimization framework to
                 identify the optimal algorithm and design parameters
                 under various objectives and constraints. To automate
                 and improve the model generation process, we propose
                 the use of L$_1$-regularized least squares regression
                 techniques. We implement two real-time image processing
                 accelerators as test cases: one for image deblurring
                 and one for block matching. For these designs, we
                 demonstrate that by sampling only a small fraction of
                 the design space (0.42\% and 1.1\%), our modeling
                 techniques are accurate within 2\%--4\% for area and
                 throughput, 8\%--9\% for power, and 5\%--6\% for
                 arithmetic accuracy. We show speedups of 340$ \times $
                 and 90$ \times $ in time for the test cases compared to
                 brute-force enumeration. We also identify the optimal
                 set of parameters for a number of scenarios (e.g.,
                 minimizing power under arithmetic inaccuracy bounds).",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Kim:2014:FPF,
  author =       "Lok-Won Kim and Sameh Asaad and Ralph Linsker",
  title =        "A Fully Pipelined {FPGA} Architecture of a Factored
                 Restricted {Boltzmann} Machine Artificial Neural
                 Network",
  journal =      j-TRETS,
  volume =       "7",
  number =       "1",
  pages =        "5:1--5:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2539125",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Mar 13 08:09:47 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Artificial neural networks (ANNs) are a natural target
                 for hardware acceleration by FPGAs and GPGPUs because
                 commercial-scale applications can require days to weeks
                 to train using CPUs, and the algorithms are highly
                 parallelizable. Previous work on FPGAs has shown how
                 hardware parallelism can be used to accelerate a
                 ``Restricted Boltzmann Machine'' (RBM) ANN algorithm,
                 and how to distribute computation across multiple
                 FPGAs. Here we describe a fully pipelined parallel
                 architecture that exploits ``mini-batch'' training
                 (combining many input cases to compute each set of
                 weight updates) to further accelerate ANN training. We
                 implement on an FPGA, for the first time to our
                 knowledge, a more powerful variant of the basic RBM,
                 the ``Factored RBM'' (fRBM). The fRBM has proved
                 valuable in learning transformations and in discovering
                 features that are present across multiple types of
                 input. We obtain (in simulation) a 100-fold
                 acceleration (vs. CPU software) for an fRBM having N =
                 256 units in each of its four groups (two input, one
                 output, one intermediate group of units) running on a
                 Virtex-6 LX760 FPGA. Many of the architectural features
                 we implement are applicable not only to fRBMs, but to
                 basic RBMs and other ANN algorithms more broadly.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Luu:2014:VNG,
  author =       "Jason Luu and Jeffrey Goeders and Michael Wainberg and
                 Andrew Somerville and Thien Yu and Konstantin
                 Nasartschuk and Miad Nasr and Sen Wang and Tim Liu and
                 Nooruddin Ahmed and Kenneth B. Kent and Jason Anderson
                 and Jonathan Rose and Vaughn Betz",
  title =        "{VTR 7.0}: Next Generation Architecture and {CAD}
                 System for {FPGAs}",
  journal =      j-TRETS,
  volume =       "7",
  number =       "2",
  pages =        "6:1--6:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2617593",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jun 30 18:26:23 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Exploring architectures for large, modern FPGAs
                 requires sophisticated software that can model and
                 target hypothetical devices. Furthermore, research into
                 new CAD algorithms often requires a complete and open
                 source baseline CAD flow. This article describes recent
                 advances in the open source Verilog-to-Routing (VTR)
                 CAD flow that enable further research in these areas.
                 VTR now supports designs with multiple clocks in both
                 timing analysis and optimization. Hard adder/carry
                 logic can be included in an architecture in various
                 ways and significantly improves the performance of
                 arithmetic circuits. The flow now models energy
                 consumption, an increasingly important concern. The
                 speed and quality of the packing algorithms have been
                 significantly improved. VTR can now generate a netlist
                 of the final post-routed circuit which enables detailed
                 simulation of a design for a variety of purposes. We
                 also release new FPGA architecture files and models
                 that are much closer to modern commercial
                 architectures, enabling more realistic experiments.
                 Finally, we show that while this version of VTR
                 supports new and complex features, it has a 1.5$ \times
                 $ compile time speed-up for simple architectures and a
                 6$ \times $ speed-up for complex architectures compared
                 to the previous release, with no degradation to timing
                 or wire-length quality.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{J:2014:MAN,
  author =       "Soumya J. and Ashish Sharma and Santanu
                 Chattopadhyay",
  title =        "Multi-Application Network-on-Chip Design using Global
                 Mapping and Local Reconfiguration",
  journal =      j-TRETS,
  volume =       "7",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2556944",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jun 30 18:26:23 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "This article proposes a reconfigurable Network-on-Chip
                 (NoC) architecture based on mesh topology. It provides
                 a local reconfiguration of cores to connect to any of
                 the neighboring routers, depending upon the currently
                 executing application. The area overhead for this local
                 reconfiguration has been shown to be very small. We
                 have also presented the strategy to map the cores of an
                 application set onto this architecture. This has been
                 achieved via a two-phase procedure. In the first phase,
                 the cores of the combined application set are mapped
                 tentatively to individual routers, minimizing the
                 communication cost. In the second phase, for each
                 application, positions of individual cores are
                 finalized. A core gets attached to any neighbor of its
                 tentative allocation. We have proposed Integer Linear
                 Programming (ILP) formulation of both the phases. Since
                 ILP takes large amount of CPU time, we have also
                 formulated a Particle Swarm Optimization (PSO)-based
                 solution for the two phases. A heuristic approach has
                 also been developed for the reconfiguration. Comparison
                 of communication cost, latency and network energy have
                 been carried out for the applications, before and after
                 reconfiguration. It shows significant improvement in
                 performance via reconfiguration.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Lei:2014:FIS,
  author =       "Yuanwu Lei and Lei Guo and Yong Dou and Sheng Ma and
                 Jinbo Xu",
  title =        "{FPGA} Implementation of a Special-Purpose {VLIW}
                 Structure for Double-Precision Elementary Function",
  journal =      j-TRETS,
  volume =       "7",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2617594",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jun 30 18:26:23 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "In the current article, the capability and flexibility
                 of field programmable gate-arrays (FPGAs) to implement
                 IEEE-754 double-precision floating-point elementary
                 functions are explored. To perform various elementary
                 functions on the unified hardware efficiently, we
                 propose a special-purpose very long instruction word
                 (VLIW) processor, called DP\_VELP. This processor is
                 equipped with multiple basic units, and its performance
                 is improved through an explicitly parallel technique.
                 Pipelined evaluation of polynomial approximation with
                 Estrin's scheme is proposed, by scheduling basic
                 components in an optimal order to avoid data hazard
                 stalls and achieve minimal latency. The custom VLIW
                 processor can achieve high scalability. Under the
                 control of specific VLIW instructions, the basic units
                 are combined into special-purpose hardware for
                 elementary functions. Common elementary functions are
                 presented as examples to illustrate the design of
                 elementary function in DP\_VELP in detail. Minimax
                 approximation scheme is used to reduce degree of
                 polynomial. Compromise between the size of lookup table
                 and the latency is discussed, and the internal
                 precision is carefully planned to guarantee accuracy of
                 the result. Finally, we create a prototype of the
                 DP\_VELP unit and an FPGA accelerator based on the
                 DP\_VELP unit on a Xilinx XC6VLX760 FPGA chip to
                 implement the SGP4/SDP4 application. Compared with
                 previous researches, the proposed design can achieve
                 low latency with a reasonable amount of resources and
                 evaluate a variety of elementary functions with the
                 unified hardware to satisfy the demands in scientific
                 applications. Experimental results show that the
                 proposed design guarantees more than 99\% of correct
                 rounding. Moreover, the SGP4/SDP4 accelerator, which is
                 equipped with 39 DP\_VELP units and runs at 200 MHz,
                 outperforms the parallel software approach with
                 hyper-thread technology on an Intel Xeon Quad E5620 CPU
                 at 2.40 GHz by a factor of 7X.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Clemente:2014:MSA,
  author =       "Juan Antonio Clemente and Ivan Beretta and Vincenzo
                 Rana and David Atienza and Donatella Sciuto",
  title =        "A Mapping-Scheduling Algorithm for Hardware
                 Acceleration on Reconfigurable Platforms",
  journal =      j-TRETS,
  volume =       "7",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2611562",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jun 30 18:26:23 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Reconfigurable platforms are a promising technology
                 that offers an interesting trade-off between
                 flexibility and performance, which many recent embedded
                 system applications demand, especially in fields such
                 as multimedia processing. These applications typically
                 involve multiple ad-hoc tasks for hardware
                 acceleration, which are usually represented using
                 formalisms such as Data Flow Diagrams (DFDs), Data Flow
                 Graphs (DFGs), Control and Data Flow Graphs (CDFGs) or
                 Petri Nets. However, none of these models is able to
                 capture at the same time the pipeline behavior between
                 tasks (that therefore can coexist in order to minimize
                 the application execution time), their communication
                 patterns, and their data dependencies. This article
                 proves that the knowledge of all this information can
                 be effectively exploited to reduce the resource
                 requirements and the timing performance of modern
                 reconfigurable systems, where a set of hardware
                 accelerators is used to support the computation. For
                 this purpose, this article proposes a novel task
                 representation model, named Temporal Constrained Data
                 Flow Diagram (TCDFD), which includes all this
                 information. This article also presents a
                 mapping-scheduling algorithm that is able to take
                 advantage of the new TCDFD model. It aims at minimizing
                 the dynamic reconfiguration overhead while meeting the
                 communication requirements among the tasks.
                 Experimental results show that the presented approach
                 achieves up to 75\% of resources saving and up to 89\%
                 of reconfiguration overhead reduction with respect to
                 other state-of-the-art techniques for reconfigurable
                 platforms.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Hoang:2014:IMD,
  author =       "Anh-Tuan Hoang and Takeshi Fujino",
  title =        "Intra-Masking Dual-Rail Memory on {LUT} Implementation
                 for {SCA}-Resistant {AES} on {FPGA}",
  journal =      j-TRETS,
  volume =       "7",
  number =       "2",
  pages =        "10:1--10:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2617595",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jun 30 18:26:23 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "In current countermeasure design trends against
                 differential power analysis (DPA), security at gate
                 level is required in addition to the security
                 algorithm. Several dual-rail pre-charge logics (DPL)
                 have been proposed to achieve this goal. Designs using
                 ASIC can attain this goal owing to its backend design
                 restrictions on placement and routing. However,
                 implementing these designs on field programmable gate
                 arrays (FPGA) without information leakage is still a
                 problem because of the difficulty involved in the
                 restrictions on placement and routing on FPGA. This
                 article describes our novel masked dual-rail
                 pre-charged memory approach, called `intra-masking
                 dual-rail memory (IMDRM) on LUT', and its
                 implementation on FPGA for Side-Channel
                 Attack-resistant (SCA-resistant) AES. In the proposed
                 design, all unsafe nodes, such as unmasking and
                 masking, and parts of dual-rail memory with unsafe
                 buses (buses that are not masked) are packed into a
                 single LUT. This makes them balanced and independent of
                 the placement and routing tools. Inputs and outputs of
                 all LUTs are masked, and so can be considered safe
                 signals. Several LUTs can be combined to create a safe
                 SBox. The design is independent of the cryptographic
                 algorithm, and hence, it can be applied to available
                 cryptographic standards such as DES or AES as well as
                 future standards. It requires no special placement or
                 route constraints in its implementation. A correlation
                 power analysis (CPA) attack on 1,000,000 traces of AES
                 implementation on FPGA showed that the secret
                 information is well protected against first-order
                 side-channel attacks. Even though the number of LUTs
                 used for memory in this implementation is seven times
                 greater than that of the conventional unprotected
                 single-rail memory table-lookup AES and three times
                 greater than the implementation based on a composite
                 field, it requires a smaller number of LUTs than all
                 other advanced SCA-resistant implementations such as
                 the wave dynamic differential logic, masked dual-rail
                 pre-charge logic, and threshold.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Becker:2014:ITS,
  author =       "Tobias Becker",
  title =        "Introduction to the {TRETS} Special Section on the
                 {Workshop on Self-Awareness in Reconfigurable Computing
                 Systems (SRCS'12)}",
  journal =      j-TRETS,
  volume =       "7",
  number =       "2",
  pages =        "11:1--11:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2611564",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jun 30 18:26:23 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Panerati:2014:CIL,
  author =       "Jacopo Panerati and Martina Maggio and Matteo
                 Carminati and Filippo Sironi and Marco Triverio and
                 Marco D. Santambrogio",
  title =        "Coordination of Independent Loops in Self-Adaptive
                 Systems",
  journal =      j-TRETS,
  volume =       "7",
  number =       "2",
  pages =        "12:1--12:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2611563",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jun 30 18:26:23 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Nowadays, the same piece of code should run on
                 different architectures, providing performance
                 guarantees in a variety of environments and situations.
                 To this end, designers often integrate existing systems
                 with ad-hoc adaptive strategies able to tune specific
                 parameters that impact performance or energy --- for
                 example, frequency scaling. However, these strategies
                 interfere with one another and unpredictable
                 performance degradation may occur due to the
                 interaction between different entities. In this
                 article, we propose a software approach to
                 reconfiguration when different strategies, called
                 loops, are encapsulated in the system and are available
                 to be activated. Our solution to loop coordination is
                 based on machine learning and it selects a policy for
                 the activation of loops inside of a system without
                 prior knowledge. We implemented our solution on top of
                 GNU/Linux and evaluated it with a significant subset of
                 the PARSEC benchmark suite.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Agne:2014:SAM,
  author =       "Andreas Agne and Markus Happe and Achim L{\"o}sch and
                 Christian Plessl and Marco Platzner",
  title =        "Self-Awareness as a Model for Designing and Operating
                 Heterogeneous Multicores",
  journal =      j-TRETS,
  volume =       "7",
  number =       "2",
  pages =        "13:1--13:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2617596",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jun 30 18:26:23 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Self-aware computing is a paradigm for structuring and
                 simplifying the design and operation of computing
                 systems that face unprecedented levels of system
                 dynamics and thus require novel forms of adaptivity.
                 The generality of the paradigm makes it applicable to
                 many types of computing systems and, previously,
                 researchers started to introduce concepts of
                 self-awareness to multicore architectures. In our work
                 we build on a recent reference architectural framework
                 as a model for self-aware computing and instantiate it
                 for an FPGA-based heterogeneous multicore running the
                 ReconOS reconfigurable architecture and operating
                 system. After presenting the model for self-aware
                 computing and ReconOS, we demonstrate with a case study
                 how a multicore application built on the principle of
                 self-awareness, autonomously adapts to changes in the
                 workload and system state. Our work shows that the
                 reference architectural framework as a model for
                 self-aware computing can be practically applied and
                 allows us to structure and simplify the design process,
                 which is essential for designing complex future
                 computing systems.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Beckhoff:2014:DTI,
  author = {Christian Beckhoff and Dirk Koch and Jim Torresen},
  title = {Design Tools for Implementing Self-Aware and
    Fault-Tolerant Systems on {FPGAs}},
  journal = j-TRETS,
  volume = {7},
  number = {2},
  pages = {14:1--14:??},
  month = jun,
  year = {2014},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2617597},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Mon Jun 30 18:26:23 MDT 2014},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {To fully exploit the capabilities of runtime
    reconfigurable FPGAs in self-aware systems, design
    tools are required that exceed the capabilities of
    present vendor design tools. Such tools must allow the
    implementation of scalable reconfigurable systems with
    various different partial modules that might be loaded
    to different positions of the device at runtime. This
    comprises several complex tasks, including
    floorplanning, communication architecture synthesis,
    physical constraints generation, physical
    implementation, and timing verification all the way
    down to the final bitstream generation. In this
    article, we present how our GoAhead framework helps in
    implementing self-aware systems on FPGAs with a minimum
    of user interaction.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {14},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}

@Article{Niu:2014:SAT,
  author = {Xinyu Niu and Qiwei Jin and Wayne Luk and Stephen
    Weston},
  title = {A Self-Aware Tuning and Self-Aware Evaluation Method
    for Finite-Difference Applications in Reconfigurable
    Systems},
  journal = j-TRETS,
  volume = {7},
  number = {2},
  pages = {15:1--15:??},
  month = jun,
  year = {2014},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2617598},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Mon Jun 30 18:26:23 MDT 2014},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {Finite-difference methods are computationally
    intensive and required by many applications. Parameters
    of a finite-difference algorithm, such as grid size,
    can be varied to generate design space which contains
    algorithm instances with different constant
    coefficients. An algorithm instance with specific
    coefficients can either be mapped into general
    operators to construct static designs, or be
    implemented as constant-specific operators to form
    dynamic designs, which require runtime reconfiguration
    to update algorithm coefficients. This article proposes
    a tuning method to explore the design space to optimise
    both the static and the dynamic designs, and an
    evaluation method to select the design with maximum
    overall throughput, based on algorithm characteristics,
    design properties, available resources and runtime data
    size. For benchmark applications option pricing and
    Reverse-Time Migration (RTM), over 50\% reduction in
    resource consumption has been achieved for both static
    designs and dynamic designs, while meeting precision
    requirements. For a single hardware implementation, the
    RTM design optimised with the proposed approach is
    expected to run 1.8 times faster than the best
    published design. The tuned static designs run
    thousands of times faster than the dynamic designs for
    algorithms with small data size, while the tuned
    dynamic designs achieve up to 5.9 times speedup over
    the corresponding static designs for large-scale
    finite-difference algorithms.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {15},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}

@Article{Laforest:2014:CMP,
  author =       "Charles Eric LaForest and Zimo Li and Tristan O'Rourke
                 and Ming G. Liu and J. Gregory Steffan",
  title =        "Composing Multi-Ported Memories on {FPGAs}",
  journal =      j-TRETS,
  volume =       "7",
  number =       "3",
  pages =        "16:1--16:??",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629629",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Sep 1 10:42:23 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Multi-ported memories are challenging to implement on
                 FPGAs since the block RAMs included in the fabric
                 typically have only two ports. Hence we must construct
                 memories requiring more than two ports, either out of
                 logic elements or by combining multiple block RAMs. We
                 present a thorough exploration and evaluation of the
                 design space of FPGA-based soft multi-ported memories
                 for conventional solutions, and also for the recently
                 proposed Live Value Table (LVT) [LaForest and Steffan
                 2010] and XOR [LaForest et al. 2012] approaches to
                 unidirectional port memories, reporting results for
                 both Altera and Xilinx FPGAs. Additionally, we
                 thoroughly evaluate and compare with a recent LVT-based
                 approach to bidirectional port memories [Choi et al.
                 2012].",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Peng:2014:BAH,
  author = {Yuanxi Peng and Manuel Salda{\~n}a and Christopher A.
    Madill and Xiaofeng Zou and Paul Chow},
  title = {Benefits of Adding Hardware Support for Broadcast and
    Reduce Operations in {MPSoC} Applications},
  journal = j-TRETS,
  volume = {7},
  number = {3},
  pages = {17:1--17:??},
  month = aug,
  year = {2014},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2629470},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Mon Sep 1 10:42:23 MDT 2014},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {MPI has been used as a parallel programming model for
    supercomputers and clusters and recently in
    MultiProcessor Systems-on-Chip (MPSoC). One component
    of MPI is collective communication and its performance
    is key for certain parallel applications to achieve
    good speedups. Previous work showed that, with
    synthetic communication-only benchmarks, communication
    improvements of up to 11.4-fold and 22-fold for
    broadcast and reduce operations, respectively, can be
    achieved by providing hardware support at the network
    level in a Network-on-Chip (NoC). However, these
    numbers do not provide a good estimation of the
    advantage for actual applications, as there are other
    factors that affect performance besides communications,
    such as computation. To this end, we extend our
    previous work by evaluating the impact of hardware
    support over a set of five parallel application kernels
    of varying computation-to-communication ratios. By
    introducing some useful computation to the performance
    evaluation, we obtain more representative results of
    the benefits of adding hardware support for broadcast
    and reduce operations. The experiments show that
    applications with lower computation-to-communication
    ratios benefit the most from hardware support as they
    highly depend on efficient collective communications to
    achieve better scalability. We also extend our work by
    doing more analysis on clock frequency, resource usage,
    power, and energy. The results show reasonable
    scalability for resource utilization and power in the
    network interfaces as the number of channels increases
    and that, even though more power is dissipated in the
    network interfaces due to the added hardware, the total
    energy used can still be less if the actual speedup is
    sufficient. The application kernels are executed in a
    24-embedded-processor system distributed across four
    FPGAs.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {17},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}

@Article{Anderson:2014:ISI,
  author =       "Jason Anderson and Kiyoung Choi",
  title =        "Introduction to the Special Issue on the {11th
                 International Conference on Field-Programmable
                 Technology (FPT'12)}",
  journal =      j-TRETS,
  volume =       "7",
  number =       "3",
  pages =        "18:1--18:??",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2655712",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Sep 1 10:42:23 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "18",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Cheah:2014:IDB,
  author = {Hui Yan Cheah and Fredrik Brosser and Suhaib A. Fahmy
    and Douglas L. Maskell},
  title = {The {iDEA DSP} Block-Based Soft Processor for
    {FPGAs}},
  journal = j-TRETS,
  volume = {7},
  number = {3},
  pages = {19:1--19:??},
  month = aug,
  year = {2014},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2629443},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Mon Sep 1 10:42:23 MDT 2014},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {DSP blocks in modern FPGAs can be used for a wide
    range of arithmetic functions, offering increased
    performance while saving logic resources for other
    uses. They have evolved to better support a plethora of
    signal processing tasks, meaning that in other
    application domains they may be underutilised. The
    DSP48E1 primitives in new Xilinx devices support
    dynamic programmability that can help extend their
    usefulness; the specific function of a DSP block can be
    modified on a cycle-by-cycle basis. However, the
    standard synthesis flow does not leverage this
    flexibility in the vast majority of cases. The lean DSP
    Extension Architecture (iDEA) presented in this article
    builds around the dynamic programmability of a single
    DSP48E1 primitive, with minimal additional logic to
    create a general-purpose processor supporting a full
    instruction-set architecture. The result is a very
    compact, fast processor that can execute a full gamut
    of general machine instructions. We show a number of
    simple applications compiled using an MIPS compiler and
    translated to the iDEA instruction set, comparing with
    a Xilinx MicroBlaze to show estimated performance
    figures. Being based on the DSP48E1, this processor can
    be deployed across next-generation Xilinx Artix-7,
    Kintex-7, Virtex-7, and Zynq families.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {19},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}

@Article{Abdelfattah:2014:NCF,
  author =       "Mohamed S. Abdelfattah and Vaughn Betz",
  title =        "Networks-on-Chip for {FPGAs}: Hard, Soft or Mixed?",
  journal =      j-TRETS,
  volume =       "7",
  number =       "3",
  pages =        "20:1--20:??",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629442",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Sep 1 10:42:23 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "As FPGA capacity increases, a growing challenge is
                 connecting ever-more components with the current
                 low-level FPGA interconnect while keeping designers
                 productive and on-chip communication efficient. We
                 propose augmenting FPGAs with networks-on-chip (NoCs)
                 to simplify design, and we show that this can be done
                 while maintaining or even improving silicon efficiency.
                 We compare the area and speed efficiency of each NoC
                 component when implemented hard versus soft to explore
                 the space and inform our design choices. We then build
                 on this component-level analysis to architect hard NoCs
                 and integrate them into the FPGA fabric; these NoCs are
                 on average 20--23$ \times $ smaller and 5--6$ \times $
                 faster than soft NoCs. A 64-node hard NoC uses only
                 $ \sim $2\% of an FPGA's silicon area and metallization.
                 We introduce a new communication efficiency metric:
                 silicon area required per realized communication
                 bandwidth. Soft NoCs consume 4960 mm$^2$/TBps, but
                 hard NoCs are 84$ \times $ more efficient at 59
                 mm$^2$/TBps. Informed design can further reduce the
                 area overhead of NoCs to 23 mm$^2$/TBps, which is only
                 2.6$ \times $ less efficient than the simplest
                 point-to-point soft links (9 mm$^2$/TBps). Despite
                 this almost comparable efficiency, NoCs can switch data
                 across the entire FPGA while point-to-point links are
                 very limited in capability; therefore, hard NoCs are
                 expected to improve FPGA efficiency for more complex
                 styles of communication.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "20",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Chen:2014:GMA,
  author = {Liang Chen and Tulika Mitra},
  title = {Graph Minor Approach for Application Mapping on
    {CGRAs}},
  journal = j-TRETS,
  volume = {7},
  number = {3},
  pages = {21:1--21:??},
  month = aug,
  year = {2014},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2655242},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Mon Sep 1 10:42:23 MDT 2014},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {Coarse-Grained Reconfigurable Arrays (CGRAs) exhibit
    high performance, improved flexibility, low cost, and
    power efficiency for various application domains.
    Compute-intensive loop kernels, which are perfect
    candidates to be executed on CGRAs, are usually mapped
    through modified modulo scheduling algorithms. These
    algorithms should be capable of performing both
    placement and routing. We formalize the CGRA mapping
    problem as a graph minor containment problem. We
    essentially test whether the dataflow graph
    representing the loop kernel is a minor of the modulo
    routing resource graph representing the CGRA resources
    and their interconnects. We design an exact graph minor
    testing approach that exploits the unique properties of
    both the dataflow graph and the routing resource graph
    to significantly prune the search space. We introduce
    additional heuristic strategies that drastically
    improve the compilation time while still generating
    optimal or near-optimal mapping solutions. Experimental
    evaluation confirms the efficiency of our approach.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {21},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}

@Article{Kim:2014:USU,
  author = {Changmoo Kim and Mookyoung Chung and Yeongon Cho and
    Mario Konijnenburg and Soojung Ryu and Jeongwook Kim},
  title = {{ULP-SRP}: Ultra Low-Power {Samsung} Reconfigurable
    Processor for Biomedical Applications},
  journal = j-TRETS,
  volume = {7},
  number = {3},
  pages = {22:1--22:??},
  month = aug,
  year = {2014},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2629610},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Mon Sep 1 10:42:23 MDT 2014},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {The latest biomedical applications require low energy
    consumption, high performance, and wide
    energy-performance scalability to adapt to various
    working environments. In this study, we present
    ULP-SRP, an energy-efficient reconfigurable processor
    for biomedical applications. ULP-SRP uses a
    Coarse-Grained Reconfigurable Array (CGRA) for
    high-performance data processing with low energy
    consumption. We adopted a compact-size CGRA and
    modified it to support dynamically switchable three
    performance modes with fine-grained power gating in
    order to further optimize the energy consumption. The
    energy-performance scalability is also accomplished
    with multiple performance modes and a Unified Memory
    Architecture (UMA). Experimental results show that
    ULP-SRP achieved 59\% energy reduction compared to
    previous works. A technique of dynamic CGRA mode
    changing gives 18.9\% energy reduction. ULP-SRP is a
    good candidate for future mobile healthcare devices.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {22},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}

@Article{Voros:2014:ISI,
  author = {Nikolaos Voros and Guy Gogniat},
  title = {Introduction to the Special Issue on the {7th
    International Workshop on Reconfigurable
    Communication-centric Systems-on-Chip (ReCoSoC'12)}},
  journal = j-TRETS,
  volume = {7},
  number = {3},
  pages = {23:1--23:??},
  month = aug,
  year = {2014},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2655710},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Mon Sep 1 10:42:23 MDT 2014},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {23},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}

@Article{Brugger:2014:RRF,
  author = {Christian Brugger and Dominic Hillenbrand and Matthias
    Balzer},
  title = {{RIVER}: Reconfigurable Flow and Fabric for Real-Time
    Signal Processing on {FPGAs}},
  journal = j-TRETS,
  volume = {7},
  number = {3},
  pages = {24:1--24:??},
  month = aug,
  year = {2014},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2655238},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Mon Sep 1 10:42:23 MDT 2014},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {For high-performance embedded hard-real-time systems,
    ASICs and FPGAs hold advantages over general-purpose
    processors and graphics accelerators (GPUs). However,
    developing signal processing architectures from scratch
    requires significant resources. Our design methodology
    is based on sets of configurable building blocks that
    provide storage, dataflow, computation, and control.
    Based on our building blocks, we generate hundreds of
    thousands of our dynamic streaming engine processors
    that we call DSEs. We store our DSEs in a repository
    that can be queried for (online) design space
    exploration. From this repository, DSEs can be
    downloaded and instantiated within milliseconds on
    FPGAs. If a loss of flexibility can be tolerated then
    ASIC implementations are feasible as well. In this
    article we focus on FPGA implementations. Our DSEs vary
    in cores, computational lanes, bitwidths, power
    consumption, and frequency. To the best of our
    knowledge we are the first to propose online design
    space exploration based on repositories of precompiled
    cores that are assembled of common building blocks. For
    demonstration purposes we map algorithms for image
    processing and financial mathematics to DSEs and
    compare the performance to existing highly optimized
    signal and graphics accelerators.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {24},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}

@Article{Itturiet:2014:APE,
  author =       "F{\'a}bio Itturiet and Gabriel Nazar and Ronaldo
                 Ferreira and {\'A}lvaro Moreira and Luigi Carro",
  title =        "Adaptive Parallelism Exploitation under Physical and
                 Real-Time Constraints for Resilient Systems",
  journal =      j-TRETS,
  volume =       "7",
  number =       "3",
  pages =        "25:1--25:??",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2556943",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Sep 1 10:42:23 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "This article introduces the resilient adaptive
                 algebraic architecture that aims at adapting
                 parallelism exploitation of a matrix multiplication
                 algorithm in a time-deterministic fashion to reduce
                 power consumption while meeting real-time deadlines
                 present in most DSP-like applications. The proposed
                 architecture provides low-overhead error correction
                 capabilities relying on the hardware implementation of
                 the algorithm-based fault-tolerance method that is
                 executed concurrently with matrix multiplication,
                 providing efficient occupation of memory and power
                 resources. The Resilient Adaptive Algebraic
                 Architecture (RA$^3$) is evaluated using three
                 real-time industrial case studies from the telecom and
                 multimedia application domains to present the design
                 space exploration and the adaptation possibilities the
                 architecture offers to hardware designers. RA$^3$ is
                 compared in its performance and energy efficiency with
                 standard high-performance architectures, namely a GPU
                 and an out-of-order general-purpose processor. Finally,
                 we present the results of fault injection campaigns in
                 order to measure the architecture resilience to soft
                 errors.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "25",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Lam:2014:EFA,
  author = {Siew-Kei Lam and Christopher T. Clarke and
    Thambipillai Srikanthan},
  title = {Exploiting {FPGA}-Aware Merging of Custom Instructions
    for Runtime Reconfiguration},
  journal = j-TRETS,
  volume = {7},
  number = {3},
  pages = {26:1--26:??},
  month = aug,
  year = {2014},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2655240},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Mon Sep 1 10:42:23 MDT 2014},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {Runtime reconfiguration is a promising solution for
    reducing hardware cost in embedded systems, without
    compromising on performance. We present a framework
    that aims to increase the performance benefits of
    reconfigurable processors that support full or partial
    runtime reconfiguration. The proposed framework
    achieves this by: (1) providing a means for choosing
    suitable custom instruction selection heuristics, (2)
    leveraging FPGA-aware merging of custom instructions to
    maximize the reconfigurable logic block utilization in
    each configuration, and (3) incorporating a
    hierarchical loop partitioning strategy to reduce
    runtime reconfiguration overhead. We show that the
    performance gain can be improved by employing suitable
    custom instruction selection heuristics that, in turn,
    depend on the reconfigurable resource constraints and
    the merging factor (extent to which the selected custom
    instructions can be merged). The hierarchical loop
    partitioning strategy leads to an average performance
    gain of over 31\% and 46\% for full and partial runtime
    reconfiguration, respectively. Performance gain can be
    further increased to over 52\% and 70\% for full and
    partial runtime reconfiguration, respectively, by
    exploiting FPGA-aware merging of custom instructions.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {26},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}

@Article{Guillet:2014:EUM,
  author =       "S{\'e}bastien Guillet and Florent de Lamotte and
                 Nicolas le Griguer and {\'E}ric Rutten and Guy Gogniat
                 and Jean-Philippe Diguet",
  title =        "Extending {UML\slash MARTE} to Support Discrete
                 Controller Synthesis, Application to Reconfigurable
                 Systems-on-Chip Modeling",
  journal =      j-TRETS,
  volume =       "7",
  number =       "3",
  pages =        "27:1--27:??",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629628",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Sep 1 10:42:23 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "This article presents the first framework to design
                 and synthesize a formal controller managing dynamic
                 reconfiguration, using a model-driven engineering
                 methodology based on an extension of UML/MARTE. The
                 implementation technique highlights the combination of
                 hard configuration constraints using weights (control
                 part)---ensured statically and fulfilled by the system
                 at runtime---and soft constraints (decision part) that,
                 given a set of correct and accessible configurations,
                 choose one of them. An application model of an image
                 processing application is presented, then transformed
                 and synthesized to be executed on a Xilinx platform to
                 show how the controller, executed on a Microblaze,
                 manages the hardware reconfigurations.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "27",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Butler:2015:HSH,
  author =       "Jon T. Butler and Tsutomu Sasao",
  title =        "High-Speed Hardware Partition Generation",
  journal =      j-TRETS,
  volume =       "7",
  number =       "4",
  pages =        "1:1--1:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629472",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Feb 13 07:24:19 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "We demonstrate circuits that generate set and integer
                 partitions on a set S of n objects at a rate of one per
                 clock. Partitions are ways to group elements of a set
                 together and have been extensively studied by
                 researchers in algorithm design and theory. We offer
                 two versions of a hardware set partition generator. In
                 the first, partitions are produced in lexicographical
                 order in response to successive clock pulses. In the
                 second, an index input determines the set partition
                 produced. Such circuits are useful in the hardware
                 implementation of the optimum distribution of tasks to
                 processors. We show circuits for integer partitions as
                 well. Our circuits are combinational. For large n, they
                 can have a large delay. However, one can easily
                 pipeline them to produce one partition per clock
                 period. We show (1) analytical and (2) experimental
                 time/complexity results that quantify the efficiency of
                 our designs. For example, our results show that a
                 hardware set partition generator running on a 100MHz
                 FPGA produces partitions at a rate that is
                 approximately 10 times the rate of a software
                 implementation on a processor running at 2.26GHz.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Paulino:2015:RAB,
  author =       "Nuno Paulino and Jo{\~a}o Canas Ferreira and Jo{\~a}o
                 M. P. Cardoso",
  title =        "A Reconfigurable Architecture for Binary Acceleration
                 of Loops with Memory Accesses",
  journal =      j-TRETS,
  volume =       "7",
  number =       "4",
  pages =        "2:1--2:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629468",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Feb 13 07:24:19 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "This article presents a reconfigurable
                 hardware/software architecture for binary acceleration
                 of embedded applications. A Reconfigurable Processing
                 Unit (RPU) is used as a coprocessor of the General
                 Purpose Processor (GPP) to accelerate the execution of
                 repetitive instruction sequences called Megablocks. A
                 toolchain detects Megablocks from instruction traces
                 and generates customized RPU implementations. The
                 implementation of Megablocks with memory accesses uses
                 a memory-sharing mechanism to support concurrent
                 accesses to the entire address space of the GPP's data
                 memory. The scheduling of load/store operations and
                 memory access handling have been optimized to minimize
                 the latency introduced by memory accesses. The system
                 is able to dynamically switch the execution between the
                 GPP and the RPU when executing the original binaries of
                 the input application. Our proof-of-concept prototype
                 achieved geometric mean speedups of 1.60$ \times $ and
                 1.18$ \times $ for, respectively, a set of 37
                 benchmarks and a subset considering the 9 most complex
                 benchmarks. With respect to a previous version of our
                 approach, we achieved geometric mean speedup
                 improvements from 1.22 to 1.53 for the 10 benchmarks
                 previously used.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Dhawan:2015:AEN,
  author =       "Udit Dhawan and Andr{\'e} DeHon",
  title =        "Area-Efficient Near-Associative Memories on {FPGAs}",
  journal =      j-TRETS,
  volume =       "7",
  number =       "4",
  pages =        "3:1--3:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629471",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Feb 13 07:24:19 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Associative memories can map sparsely used keys to
                 values with low latency but can incur heavy area
                 overheads. The lack of customized hardware for
                 associative memories in today's mainstream FPGAs
                 exacerbates the overhead cost of building these
                 memories using the fixed address match BRAMs. In this
                 article, we develop a new, FPGA-friendly, memory system
                 architecture based on a multiple hash scheme that is
                 able to achieve near-associative performance without
                 the area-delay overheads of a fully associative memory
                 on FPGAs. At the same time, we develop a novel memory
                 management algorithm that allows us to statistically
                 mimic an associative memory. Using the proposed
                 architecture as a 64KB L1 data cache, we show that it
                 is able to achieve near-associative miss rates while
                 consuming 3--13 $ \times $ fewer FPGA memory resources
                 for a set of benchmark programs from the SPEC CPU2006
                 suite than fully associative memories generated by the
                 Xilinx Coregen tool. Benefits for our architecture
                 increase with key width, allowing area reduction up to
                 100 $ \times $. Mapping delay is also reduced to 3.7ns
                 for a 1,024-entry flat version or 6.1ns for an
                 area-efficient version compared to 17.6ns for a fully
                 associative memory for a 64-bit key on a Xilinx Virtex
                 6 device.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Llamocca:2015:DEP,
  author =       "Daniel Llamocca and Marios Pattichis",
  title =        "Dynamic Energy, Performance, and Accuracy Optimization
                 and Management Using Automatically Generated
                 Constraints for Separable {$2$D} {FIR} Filtering for
                 Digital Video Processing",
  journal =      j-TRETS,
  volume =       "7",
  number =       "4",
  pages =        "4:1--4:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629623",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Feb 13 07:24:19 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "There is strong interest in the development of
                 dynamically reconfigurable systems that can meet
                 real-time constraints on energy, performance, and
                 accuracy. The generation of real-time constraints will
                 significantly expand the applicability of dynamically
                 reconfigurable systems to new domains, such as digital
                 video processing. We develop a dynamically
                 reconfigurable 2D FIR filtering system that can meet
                 real-time constraints in energy, performance, and
                 accuracy (EPA). The real-time constraints are
                 automatically generated based on user input, image
                 types associated with video communications, and video
                 content. We first generate a set of Pareto-optimal
                 realizations, described by their EPA values and
                 associated 2D FIR hardware description bitstreams.
                 Dynamic management is then achieved by selecting
                 Pareto-optimal realizations that meet the automatically
                 generated time-varying EPA constraints. We validate our
                 approach using three different 2D Gaussian filters.
                 Filter realizations are evaluated in terms of the
                 required energy per frame, accuracy of the resulting
                 image, and performance in frames per second. We
                 demonstrate dynamic EPA management by applying a
                 Difference of Gaussians (DOG) filter to standard video
                 sequences. For video frame sizes that are equal to or
                 larger than the VGA resolution, compared to a static
                 implementation, our dynamic system provides significant
                 reduction in the total energy consumption
                 ({$>$30}\%).",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Gojman:2015:GLG,
  author =       "Benjamin Gojman and Sirisha Nalmela and Nikil Mehta
                 and Nicholas Howarth and Andr{\'e} DeHon",
  title =        "{GROK-LAB}: Generating Real On-chip Knowledge for
                 Intra-cluster Delays Using Timing Extraction",
  journal =      j-TRETS,
  volume =       "7",
  number =       "4",
  pages =        "5:1--5:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2597889",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Feb 13 07:24:19 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Timing Extraction identifies the delay of fine-grained
                 components within an FPGA. From these computed delays,
                 the delay of any path can be calculated. Moreover, a
                 comparison of the fine-grained delays allows a detailed
                 understanding of the amount and type of process
                 variation that exists in the FPGA. To obtain these
                 delays, Timing Extraction measures, using only
                 resources already available in the FPGA, the delay of a
                 small subset of the total paths in the FPGA. We apply
                 Timing Extraction to the Logic Array Block (LAB) on an
                 Altera Cyclone III FPGA to obtain a view of the delay
                 down to near-individual LUT SRAM cell granularity,
                 characterizing components with delays on the order of
                 tens to a few hundred picoseconds with a resolution of
                 $ \pm {}3.2 $ ps, matching the expected error bounds.
                 This information reveals that the 65nm process used
                 has, on average, random variation of $ \sigma / \mu =
                 4.0 \% $ with components having an average maximum
                 spread of 83ps. Timing Extraction also shows that as $
                 V_{DD} $ decreases from 1.2V to 0.9V in a Cyclone IV
                 60nm FPGA, paths slow down, and variation increases
                 from $ \sigma / \mu = 4.3 \% $ to $ \sigma / \mu = 5.8
                 \% $, a clear indication that lowering $ V_{DD} $
                 magnifies the impact of random variation.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Mahram:2015:NBH,
  author =       "Atabak Mahram and Martin C. Herbordt",
  title =        "{NCBI BLASTP} on High-Performance Reconfigurable
                 Computing Systems",
  journal =      j-TRETS,
  volume =       "7",
  number =       "4",
  pages =        "6:1--6:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629691",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Feb 13 07:24:19 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "The BLAST sequence alignment program is a central
                 application in bioinformatics. The de facto standard
                 version, NCBI BLAST, uses complex heuristics that make
                 it challenging to simultaneously achieve both high
                 performance and exact agreement. We propose a system
                 that uses novel FPGA-based filters that reduce the
                 input database by over 99.97\% without loss of
                 sensitivity. There are several contributions. First is
                 design of the filters themselves, which perform two-hit
                 seeding, exhaustive ungapped alignment, and exhaustive
                 gapped alignments, respectively. Second is the coupling
                 of the filters, especially the two-hit seeding and the
                 ungapped alignment. Third is pipelining the filters in
                 a single design, including maintaining load balancing
                 as data are reduced by orders of magnitude at each
                 stage. Fourth is the optimization required to maintain
                 operating frequency for the resulting complex design.
                 And finally, there is system integration both in
                 hardware (the Convey HC1-EX) and software (NCBI
                 BLASTP). We present results for various usage scenarios
                 and find complete agreement and a factor of nearly 5x
                 speedup over a fully parallel implementation of the
                 reference code on a contemporaneous CPU. We believe
                 that the resulting system is the leading
                 per-socket-accelerated NCBI BLAST.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Swierczynski:2015:PSE,
  author =       "Pawel Swierczynski and Amir Moradi and David Oswald
                 and Christof Paar",
  title =        "Physical Security Evaluation of the Bitstream
                 Encryption Mechanism of {Altera Stratix II} and
                 {Stratix III} {FPGAs}",
  journal =      j-TRETS,
  volume =       "7",
  number =       "4",
  pages =        "7:1--7:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629462",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Feb 13 07:24:19 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "To protect Field-Programmable Gate Array (FPGA)
                 designs against Intellectual Property (IP) theft and
                 related issues such as product cloning, all major FPGA
                 manufacturers offer a mechanism to encrypt the
                 bitstream that is used to configure the FPGA. From a
                 mathematical point of view, the employed encryption
                 algorithms (e.g., Advanced Encryption Standard (AES) or
                 3DES) are highly secure. However, it has been shown
                 that the bitstream encryption feature of several FPGA
                 families is susceptible to side-channel attacks based
                 on measuring the power consumption of the cryptographic
                 module. In this article, we present the first
                 successful attack on the bitstream encryption of the
                 Altera Stratix II and Stratix III FPGA families. To
                 this end, we analyzed the Quartus II software and
                 reverse engineered the details of the proprietary and
                 unpublished schemes used for bitstream encryption on
                 Stratix II and Stratix III. Using this knowledge, we
                 demonstrate that the full 128-bit AES key of a Stratix
                 II as well as the full 256-bit AES key of a Stratix III
                 can be recovered by means of side-channel attacks. In
                 both cases, the attack can be conducted in a few hours.
                 The complete bitstream of these FPGAs that are
                 (seemingly) protected by the bitstream encryption
                 feature can hence fall into the hands of a competitor
                 or criminal---possibly implying system-wide damage if
                 confidential information such as proprietary encryption
                 schemes or secret keys programmed into the FPGA are
                 extracted. In addition to lost IP, reprogramming the
                 attacked FPGA with modified code, for instance, to
                 secretly plant a hardware Trojan, is a particularly
                 dangerous scenario for many security-critical
                 applications.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Vliegen:2015:SRD,
  author =       "Jo Vliegen and Nele Mentens and Ingrid Verbauwhede",
  title =        "Secure, Remote, Dynamic Reconfiguration of {FPGAs}",
  journal =      j-TRETS,
  volume =       "7",
  number =       "4",
  pages =        "8:1--8:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629423",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Feb 13 07:24:19 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "With the widespread availability of broadband
                 Internet, Field-Programmable Gate Arrays (FPGAs) can
                 get remote updates in the field. This provides hardware
                 and software updates, and enables issue solving and
                 upgrade ability without device modification. In order
                 to prevent an attacker from eavesdropping or
                 manipulating the configuration data, security is a
                 necessity. This work describes an architecture that
                 allows the secure, remote reconfiguration of an FPGA.
                 The architecture is partially dynamically
                 reconfigurable and it consists of a static partition
                 that handles the secure communication protocol and a
                 single reconfigurable partition that holds the main
                 application. Our solution distinguishes itself from
                 existing work in two ways: it provides entity
                 authentication and it avoids the use of a trusted third
                 party. The former provides protection against active
                 attackers on the communication channel, while the
                 latter reduces the number of reliable entities.
                 Additionally, this work provides basic countermeasures
                 against simple power-oriented side-channel analysis
                 attacks. The result is an implementation that is
                 optimized toward minimal resource occupation. Because
                 configuration updates occur infrequently, configuration
                 speed is of minor importance with respect to area. A
                 prototype of the proposed design is implemented, using
                 5,702 slices and having minimal downtime.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Chau:2015:MAP,
  author =       "Thomas C. P. Chau and Xinyu Niu and Alison Eele and
                 Jan Maciejowski and Peter Y. K. Cheung and Wayne Luk",
  title =        "Mapping Adaptive Particle Filters to Heterogeneous
                 Reconfigurable Systems",
  journal =      j-TRETS,
  volume =       "7",
  number =       "4",
  pages =        "9:1--9:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629469",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Feb 13 07:24:19 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "This article presents an approach for mapping
                 real-time applications based on particle filters (PFs)
                 to heterogeneous reconfigurable systems, which
                 typically consist of multiple FPGAs and CPUs. A method
                 is proposed to adapt the number of particles
                 dynamically and to utilise runtime reconfigurability of
                 FPGAs for reduced power and energy consumption. A data
                 compression scheme is employed to reduce communication
                 overhead between FPGAs and CPUs. A mobile robot
                 localisation and tracking application is developed to
                 illustrate our approach. Experimental results show that
                 the proposed adaptive PF can reduce up to 99\% of
                 computation time. Using runtime reconfiguration, we
                 achieve a 25\% to 34\% reduction in idle power. A 1U
                 system with four FPGAs is up to 169 times faster than a
                 single-core CPU and 41 times faster than a 1U CPU
                 server with 12 cores. It is also estimated to be 3
                 times faster than a system with four GPUs.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Miller:2015:GBA,
  author =       "Bailey Miller and Frank Vahid and Tony Givargis and
                 Philip Brisk",
  title =        "Graph-Based Approaches to Placement of Processing
                 Element Networks on {FPGAs} for Physical Model
                 Simulation",
  journal =      j-TRETS,
  volume =       "7",
  number =       "4",
  pages =        "10:1--10:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629521",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Feb 13 07:24:19 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Physical models utilize mathematical equations to
                 characterize physical systems like airway mechanics,
                 neuron networks, or chemical reactions. Previous work
                 has shown that field programmable gate arrays (FPGAs)
                 execute physical models efficiently. To improve the
                 implementation of physical models on FPGAs, this
                 article leverages graph theoretic techniques to
                 synthesize physical models onto FPGAs. The first phase
                 maps physical model equations onto a structured virtual
                 processing element (PE) graph using graph theoretic
                 folding techniques. The second phase maps the
                 structured virtual PE graph onto physical PE regions on
                 an FPGA using graph embedding theory. A simulated
                 annealing algorithm is introduced that can map any
                 physical model onto an FPGA regardless of the model's
                 underlying topology. We further extend the simulated
                 annealing approach by leveraging existing graph drawing
                 algorithms to generate the initial placement. Compared
                 to previous work on physical model implementation on
                 FPGAs, embedding increases clock frequency by 25\% on
                 average (for applicable topologies), whereas simulated
                 annealing increases frequency by 13\% on average. The
                 embedding approach typically produces a circuit whose
                 frequency is limited by the FPGA clock instead of
                 routing. Additionally, complex models that could not
                 previously be routed due to complexity were made
                 routable when using placement constraints.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{DiCarlo:2015:SSA,
  author =       "Stefano {Di Carlo} and Giulio Gambardella and Paolo
                 Prinetto and Daniele Rolfo and Pascal Trotta",
  title =        "{SATTA}: a {Self-Adaptive Temperature-Based TDF
                 Awareness} Methodology for Dynamically Reconfigurable
                 {FPGAs}",
  journal =      j-TRETS,
  volume =       "8",
  number =       "1",
  pages =        "1:1--1:??",
  month =        feb,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2659001",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 7 16:45:25 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Dependability issues due to nonfunctional properties
                 are emerging as a major cause of faults in modern
                 digital systems. Effective countermeasures have to be
                 developed to properly manage their critical timing
                 effects. This article presents a methodology to avoid
                 transition delay faults in field-programmable gate
                 array (FPGA)-based systems, with low area overhead. The
                 approach is able to exploit temperature information and
                 aging characteristics to minimize the cost in terms of
                 performances degradation and power consumption. The
                 architecture of a hardware manager able to avoid delay
                 faults is presented and analyzed extensively, as well
                 as its integration in the standard implementation
                 design flow.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Cooke:2015:TAF,
  author =       "Patrick Cooke and Jeremy Fowers and Greg Brown and
                 Greg Stitt",
  title =        "A Tradeoff Analysis of {FPGAs}, {GPUs}, and Multicores
                 for Sliding-Window Applications",
  journal =      j-TRETS,
  volume =       "8",
  number =       "1",
  pages =        "2:1--2:??",
  month =        feb,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2659000",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 7 16:45:25 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "The increasing usage of hardware accelerators such as
                 Field-Programmable Gate Arrays (FPGAs) and Graphics
                 Processing Units (GPUs) has significantly increased
                 application design complexity. Such complexity results
                 from a larger design space created by numerous
                 combinations of accelerators, algorithms, and hw/sw
                 partitions. Exploration of this increased design space
                 is critical due to widely varying performance and
                 energy consumption for each accelerator when used for
                 different application domains and different use cases.
                 To address this problem, numerous studies have
                 evaluated specific applications across different
                 architectures. In this article, we analyze an important
                 domain of applications, referred to as sliding-window
                 applications, implemented on FPGAs, GPUs, and multicore
                 CPUs. For each device, we present optimization
                 strategies and analyze use cases where each device is
                 most effective. The results show that, for large input
                 sizes, FPGAs can achieve speedups of up to $ 5.6 \times
                 $ and $ 58 \times $ compared to GPUs and multicore
                 CPUs, respectively, while also using up to an order of
                 magnitude less energy. For small input sizes and
                 applications with frequency-domain algorithms, GPUs
                 generally provide the best performance and energy.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Quinn:2015:CFE,
  author =       "Heather Quinn and Diane Roussel-Dupre and Mike Caffrey
                 and Paul Graham and Michael Wirthlin and Keith Morgan
                 and Anthony Salazar and Tony Nelson and Will Howes and
                 Eric Johnson and Jon Johnson and Brian Pratt and Nathan
                 Rollins and Jim Krone",
  title =        "The {Cibola Flight Experiment}",
  journal =      j-TRETS,
  volume =       "8",
  number =       "1",
  pages =        "3:1--3:??",
  month =        feb,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629556",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 7 16:45:25 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Over the past 15 years many organizations have
                 researched the use of Static-Random Access Memory
                 (SRAM)-based Field-Programmable Gate Arrays (FPGAs) in
                 space. Although the components can provide a
                 performance improvement over radiation-hardened
                 processing components, random soft errors can occur
                 from the naturally occurring space radiation
                 environment. Many organizations have been developing
                 methods for characterizing, emulating, and simulating
                 radiation-induced events; mitigating and removing
                 radiation-induced computational errors; and designing
                 fault-tolerant reconfigurable spacecraft. Los Alamos
                 National Laboratory has fielded one of the longest
                 space-based FPGA experiments, called the Cibola Flight
                 Experiment (CFE), using Xilinx Virtex FPGAs. CFE has
                 successfully deployed commercial SRAM FPGAs into a
                 low-Earth orbit with Single-Event Upset (SEU)
                 mitigation and was able to exploit effectively the
                 reconfigurability and customization of FPGAs in a harsh
                 radiation environment. Although older than current
                 state-of-the-art FPGAs, these same concepts are used to
                 deploy newer FPGA-based space systems since the launch
                 of the CFE satellite and will continue to be useful for
                 newer systems. In this article, we present how the
                 system was designed to be fault tolerant, prelaunch
                 predictions of expected on-orbit behaviors, and
                 on-orbit results.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Davidson:2015:IDC,
  author =       "Tom Davidson and Elias Vansteenkiste and Karel Heyse
                 and Karel Bruneel and Dirk Stroobandt",
  title =        "Identification of Dynamic Circuit Specialization
                 Opportunities in {RTL} Code",
  journal =      j-TRETS,
  volume =       "8",
  number =       "1",
  pages =        "4:1--4:??",
  month =        feb,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629640",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 7 16:45:25 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Dynamic Circuit Specialization (DCS) optimizes a
                 Field-Programmable Gate Array (FPGA) design by assuming
                 a set of its input signals are constant for a
                 reasonable amount of time, leading to a smaller and
                 faster FPGA circuit. When the signals actually change,
                 a new circuit is loaded into the FPGA through runtime
                 reconfiguration. The signals the design is specialized
                 for are called parameters. For certain designs,
                 parameters can be selected so the DCS implementation is
                 both smaller and faster than the original
                 implementation. However, DCS also introduces an
                 overhead that is difficult for the designer to take
                 into account, making it hard to determine whether a
                 design is improved by DCS or not. This article presents
                 extensive results on a profiling methodology that
                 analyses Register-Transfer Level (RTL) implementations
                 of applications to check if DCS would be beneficial. It
                 proposes to use the functional density as a measure for
                 the area efficiency of an implementation, as this
                 measure contains both the overhead and the gains of a
                 DCS implementation. The first step of the methodology
                 is to analyse the dynamic behaviour of signals in the
                 design, to find good parameter candidates. The overhead
                 of DCS is highly dependent on this dynamic behaviour. A
                 second stage calculates the functional density for each
                 candidate and compares it to the functional density of
                 the original design. The profiling methodology resulted
                 in three implementations of a profiling tool, the
                 DCS-RTL profiler. The execution time, accuracy, and the
                 quality of each implementation are assessed based on
                 data from 10 RTL designs. All designs, except for the
                 two 16-bit adaptable Finite Impulse Response (FIR)
                 filters, are analysed in 1 hour or less.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Iturbe:2015:MAH,
  author =       "Xabier Iturbe and Khaled Benkrid and Chuan Hong and
                 Ali Ebrahim and Raul Torrego and Tughrul Arslan",
  title =        "Microkernel Architecture and Hardware Abstraction
                 Layer of a Reliable Reconfigurable Real-Time Operating
                 System {(R3TOS)}",
  journal =      j-TRETS,
  volume =       "8",
  number =       "1",
  pages =        "5:1--5:??",
  month =        feb,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629639",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 7 16:45:25 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "This article presents a new solution for easing the
                 development of reconfigurable applications using
                 Field-Programmable Gate Arrays (FPGAs). Namely, our
                 Reliable Reconfigurable Real-Time Operating System
                 (R3TOS) provides OS-like support for partially
                 reconfigurable FPGAs. Unlike related works, R3TOS is
                 founded on the basis of resource reusability and
                 computation ephemerality. It makes intensive use of
                 reconfiguration at very fine FPGA granularity, keeping
                 the logic resources used only while performing
                 computation and releasing them as soon as it is
                 completed. To achieve this goal, R3TOS goes beyond the
                 traditional approach of using reconfigurable slots with
                 fixed boundaries interconnected by means of a static
                 communication infrastructure. Instead, R3TOS approaches
                 a static route-free system where nearly everything is
                 reconfigurable. The tasks are concatenated to form a
                 computation chain through which partial results
                 naturally flow, and data are exchanged among remotely
                 located tasks using FPGA's reconfiguration mechanism or
                 by means of ``removable'' routing circuits. In this
                 article, we describe the R3TOS microkernel architecture
                 as well as its hardware abstraction services and
                 programming interface. Notably, the article presents a
                 set of novel circuits and mechanisms to overcome the
                 limitations and exploit the opportunities of Xilinx
                 reconfigurable technology in the scope of hardware
                 multitasking and dependability.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Shi:2015:IDD,
  author =       "Kan Shi and David Boland and George A.
                 Constantinides",
  title =        "Imprecise Datapath Design: an Overclocking Approach",
  journal =      j-TRETS,
  volume =       "8",
  number =       "2",
  pages =        "6:1--6:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629527",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue May 19 17:05:20 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "In this article, we describe an alternative circuit
                 design methodology when considering trade-offs between
                 accuracy, performance, and silicon area. We compare two
                 different approaches that could trade accuracy for
                 performance. One is the traditional approach where the
                 precision used in the datapath is limited to meet a
                 target latency. The other is a proposed new approach
                 which simply allows the datapath to operate without
                 timing closure. We demonstrate analytically and
                 experimentally that on average our approach obtains
                 either smaller errors or equivalent faster operating
                 frequencies in comparison to the traditional approach.
                 This is because the worst case caused by timing
                 violations only happens rarely, while precision loss
                 results in errors to most data. We also show that for
                 basic arithmetic operations such as addition, applying
                 our approach to the simple building block of ripple
                 carry adders can achieve better accuracy or performance
                 than using faster adder designs to achieve similar
                 latency.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Woods:2015:PDP,
  author =       "Louis Woods and Gustavo Alonso and Jens Teubner",
  title =        "Parallelizing Data Processing on {FPGAs} with Shifter
                 Lists",
  journal =      j-TRETS,
  volume =       "8",
  number =       "2",
  pages =        "7:1--7:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629551",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue May 19 17:05:20 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Parallelism is currently seen as a mechanism to
                 minimize the impact of the power and heat dissipation
                 problems encountered in modern hardware. Data
                 parallelism---based on partitioning the data---and
                 pipeline parallelism---based on partitioning the
                 computation---are the two main approaches to
                 leverage parallelism on a
                 wide range of hardware platforms. Unfortunately, not
                 all data processing problems are susceptible to either
                 of those strategies. An example is the skyline operator
                 [B{\"o}rzs{\"o}nyi et al. 2001], which computes the set
                 of Pareto-optimal points within a multidimensional
                 dataset. Existing approaches to parallelize the skyline
                 operator are based on data parallelism. As a result,
                 they suffer from a high overhead when merging
                 intermediate results because of the lack of a global
                 view of the problem inherent to partitioning the input
                 data. In this article, we show how to combine pipeline
                 with data parallelism on a Field-Programmable Gate
                 Array (FPGA) for a more efficient utilization of the
                 available hardware parallelism. As we show in our
                 experiments, skyline computation using our proposed
                 technique scales linearly with the number of processing
                 elements, and the performance we achieve on a rather
                 small FPGA is comparable to that of a 64-core high-end
                 server running a state-of-the-art data parallel
                 implementation of skyline [Park et al. 2009]. The
                 proposed approach to parallelize the skyline operator
                 can be generalized to a wider range of data processing
                 problems. We demonstrate this through a novel, highly
                 parallel data structure, a shifter list, that can be
                 efficiently implemented on an FPGA. The resulting
                 template is easy to parametrize to implement a variety
                 of computationally intensive operators such as frequent
                 items, $n$-closest pairs, or K-means.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Cardoso:2015:GEF,
  author =       "Jo{\~a}o M. P. Cardoso and Pedro C. Diniz and
                 Katherine (Compton) Morrow",
  title =        "Guest Editorial: {FPL 2013}",
  journal =      j-TRETS,
  volume =       "8",
  number =       "2",
  pages =        "8:1--8:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2737805",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue May 19 17:05:20 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Ferreira:2015:RFP,
  author =       "Ricardo Ferreira and Luciana Rocha and Andr{\'e} G.
                 Santos and Jos{\'e} A. M. Nacif and Stephan Wong and
                 Luigi Carro",
  title =        "A Runtime {FPGA} Placement and Routing Using
                 Low-Complexity Graph Traversal",
  journal =      j-TRETS,
  volume =       "8",
  number =       "2",
  pages =        "9:1--9:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2660775",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue May 19 17:05:20 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Dynamic Partial Reconfiguration (DPaR) enables
                 efficient allocation of logic resources by adding new
                 functionalities or by sharing and/or multiplexing
                 resources over time. Placement and routing (P\&R) is
                 one of the most time-consuming steps in the DPaR flow.
                 P\&R are two independent NP-complete problems, and,
                 even for medium size circuits, traditional P\&R
                 algorithms are not capable of placing and routing
                 hardware modules at runtime. We propose a novel runtime
                 P\&R algorithm for Field-Programmable Gate Array
                 (FPGA)-based designs. Our algorithm models the FPGA as
                 an implicit graph with a direct correspondence to the
                 target FPGA. The P\&R is performed as a graph mapping
                 problem by exploring the node locality during a
                 depth-first traversal. We perform the P\&R using a
                 greedy heuristic that executes in polynomial time.
                 Unlike state-of-the-art algorithms, our approach does
                 not try similar solutions, thus allowing the P\&R to
                 execute in milliseconds. Our algorithm is also suitable
                 for P\&R in fragmented regions. We generate results for
                 a manufacturer-independent virtual FPGA. Compared with
                 the most popular P\&R tool running the same benchmark
                 suite, our algorithm is up to three orders of magnitude
                 faster.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Murray:2015:TDT,
  author =       "Kevin E. Murray and Scott Whitty and Suya Liu and
                 Jason Luu and Vaughn Betz",
  title =        "Timing-Driven {Titan}: Enabling Large Benchmarks and
                 Exploring the Gap between Academic and Commercial
                 {CAD}",
  journal =      j-TRETS,
  volume =       "8",
  number =       "2",
  pages =        "10:1--10:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629579",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue May 19 17:05:20 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Benchmarks play a key role in Field-Programmable Gate
                 Array (FPGA) architecture and CAD research, enabling
                 the quantitative comparison of tools and architectures.
                 It is important that these benchmarks reflect modern
                 large-scale systems that make use of heterogeneous
                 resources; however, most current FPGA benchmarks are
                 both small and simple. In this article, we present
                 Titan, a hybrid CAD flow that addresses these issues.
                 The flow uses Altera's Quartus II FPGA CAD software to
                 perform HDL synthesis and a conversion tool to
                 translate the result into the academic Berkeley Logic
                 Interchange Format (BLIF). Using this flow, we created
                 the Titan23 benchmark set, which consists of 23 large
                 (90K--1.8M block) benchmark circuits covering a wide
                 range of application domains. Using the Titan23
                 benchmarks and an enhanced model of Altera's Stratix IV
                 architecture, including a detailed timing model, we
                 compare the performance and quality of VPR and Quartus
                 II targeting the same architecture. We found that VPR
                 is at least $ 2.8 \times $ slower, uses $ 6.2 \times $
                 more memory, $ 2.2 \times $ more wire, and produces
                 critical paths $ 1.5 \times $ slower compared to
                 Quartus II. Finally, we identified that VPR's focus on
                 achieving a dense packing and an inability to take
                 apart clusters is responsible for a large portion of
                 the wire length and critical path delay gap.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Gan:2015:SGA,
  author =       "Lin Gan and Haohuan Fu and Wayne Luk and Chao Yang and
                 Wei Xue and Xiaomeng Huang and Youhui Zhang and
                 Guangwen Yang",
  title =        "Solving the Global Atmospheric Equations through
                 Heterogeneous Reconfigurable Platforms",
  journal =      j-TRETS,
  volume =       "8",
  number =       "2",
  pages =        "11:1--11:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629581",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue May 19 17:05:20 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "One of the most essential and challenging components
                 in climate modeling is the atmospheric model. To solve
                 multiphysical atmospheric equations, developers have to
                 face extremely complex stencil kernels that are costly
                 in terms of both computing and memory resources. This
                 article aims to accelerate the solution of global
                 shallow water equations (SWEs), which is one of the
                 most essential equation sets describing atmospheric
                 dynamics. We first design a hybrid methodology that
                 employs both the host CPU cores and the
                 field-programmable gate array (FPGA) accelerators to
                 work in parallel. Through a careful adjustment of the
                 computational domains, we achieve a balanced resource
                 utilization and a further improvement of the overall
                 performance. By decomposing the resource-demanding SWE
                 kernel, we manage to map the double-precision algorithm
                 into three FPGAs. Moreover, by using fixed-point and
                 reduced-precision floating point arithmetic, we manage
                 to build a fully pipelined mixed-precision design on a
                 single FPGA, which can perform 428 floating-point and
                 235 fixed-point operations per cycle. The
                 mixed-precision design with four FPGAs running together
                 can achieve a speedup of 20 over a fully optimized
                 design on a CPU rack with two eight-core processors and
                 is 8 times faster than the fully optimized Kepler GPU
                 design. As for power efficiency, the mixed-precision
                 design with four FPGAs is 10 times more power efficient
                 than a Tianhe-1A supercomputer node.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Das:2015:ASE,
  author =       "Anup Das and Shyamsundar Venkataraman and Akash
                 Kumar",
  title =        "Autonomous Soft-Error Tolerance of {FPGA}
                 Configuration Bits",
  journal =      j-TRETS,
  volume =       "8",
  number =       "2",
  pages =        "12:1--12:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629580",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue May 19 17:05:20 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Field-programmable gate arrays (FPGAs) are
                 increasingly susceptible to radiation-induced single
                 event upsets (SEUs). These upsets are predominant in a
                 space environment; however, with increasing use of
                 static RAM (SRAM) in modern FPGAs, these SEUs are
                 gaining prominence even in a terrestrial environment.
                 SEUs can flip SRAM bits of FPGA, potentially altering
                 the functionality of the implemented design. This has
                 motivated FPGA designers to investigate techniques to
                 protect the FPGA configuration bits against such
                 inadvertent bit flips (soft error). Traditionally,
                 triple modular redundancy (TMR) is used to protect the
                 FPGA bit flips. Increasing design complexity and
                 limited battery life motivate alternative
                 approaches for soft-error tolerance. In this article,
                 we propose a technique to improve autonomous
                 fault-masking capabilities of a design by maximizing
                 the number of zeros or ones in lookup tables (LUTs).
                 The technique analyzes critical configuration bits and
                 utilizes spare resources (XOR gates and carry chains)
                 of FPGAs to selectively manipulate the logic
                 implemented in LUTs using two operations: LUT
                 restructuring and LUT decomposition. We implemented the
                 proposed approach for Xilinx Virtex-6 FPGAs and
                 validated the same with a wide set of designs from the
                 MCNC, IWLS 2005, and ITC99 benchmark suites. Results
                 demonstrate that the proposed logic restructuring
                 maximizes logic 0 (or 1) of LUTs by an average of 20\%,
                 achieving 80\% fault masking with no area overhead. The
                 fault rate of the entire design is reduced by 60\% on
                 average as compared to the existing techniques.
                 Furthermore, the logic decomposition algorithm provides
                 incremental fault-tolerance capabilities and achieves
                 an additional 5\% fault masking with an average 7\%
                 increase in slice usage. The complete methodology is
                 implemented into a tool for Xilinx FPGA and is made
                 available online for the benefit of the research
                 community. The algorithms are lightweight, and the
                 whole design flow (including Xilinx Place and Route)
                 was completed in 75 minutes for the largest benchmark
                 in the set.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Istvan:2015:HTL,
  author =       "Zsolt Istv{\'a}n and Gustavo Alonso and Michaela Blott
                 and Kees Vissers",
  title =        "A Hash Table for Line-Rate Data Processing",
  journal =      j-TRETS,
  volume =       "8",
  number =       "2",
  pages =        "13:1--13:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629582",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue May 19 17:05:20 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "FPGA-based data processing is becoming increasingly
                 relevant in data centers, as the transformation of
                 existing applications into dataflow architectures can
                 bring significant throughput and power benefits.
                 Furthermore, a tighter integration of computing and
                 network is appealing, as it overcomes traditional
                 bottlenecks between CPUs and network interfaces, and
                 dramatically reduces latency. In this article, we
                 present the design of a novel hash table, a fundamental
                 building block used in many applications, to enable
                 data processing on FPGAs close to the network. We
                 present a fully pipelined design capable of sustaining
                 consistent 10Gbps line-rate processing by deploying a
                 concurrent mechanism to handle hash collisions. We
                 address additional design challenges such as support
                 for a broad range of key sizes without stalling the
                 pipeline through careful matching of lookup time with
                 packet reception time. Finally, the design is based on
                 a scalable architecture that can be easily
                 parameterized to work with different memory types
                 operating at different access speeds and latencies. We
                 have tested the proposed hash table in an FPGA-based
                 memcached appliance implementing a main-memory
                 key-value store in hardware. The hash table is used to
                 index 2 million entries in 24GB of external DDR3 DRAM
                 while sustaining 13 million requests per second, the
                 maximum packet rate that can be achieved with UDP
                 packets on a 10Gbps link for this application.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

%%% TRETS 8(3), May 2015, article 14: how compiler optimizations
%%% affect high-level-synthesis-generated hardware.
@Article{Huang:2015:ECO,
  author = {Qijing Huang and Ruolong Lian and Andrew Canis and
    Jongsok Choi and Ryan Xi and Nazanin Calagar and
    Stephen Brown and Jason Anderson},
  title = {The Effect of Compiler Optimizations on High-Level
    Synthesis-Generated Hardware},
  journal = j-TRETS,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
  volume = {8},
  number = {3},
  articleno = {14},
  pages = {14:1--14:??},
  month = may,
  year = {2015},
  CODEN = {????},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  DOI = {https://doi.org/10.1145/2629547},
  abstract = {We consider the impact of compiler optimizations on
    the quality of high-level synthesis (HLS)-generated
    field-programmable gate array (FPGA) hardware. Using an
    HLS tool implemented within the state-of-the-art LLVM
    compiler, we study the effect of compiler optimizations
    on the hardware metrics of circuit area, execution
    cycles, FMax, and wall-clock time. We evaluate 56
    different compiler optimizations implemented within
    LLVM and show that some optimizations significantly
    affect hardware quality. Moreover, we show that
    hardware quality is also affected by some optimization
    parameter values, as well as the order in which
    optimizations are applied. We then present a new
    HLS-directed approach to compiler optimizations,
    wherein we execute partial HLS and profiling at
    intermittent points in the optimization process and use
    the results to judiciously undo the impact of
    optimization passes predicted to be damaging to the
    generated hardware quality. Results show that our
    approach produces circuits with 16\% better speed
    performance, on average, versus using the standard {\tt
    -O3} optimization level.},
  bibdate = {Tue May 19 17:05:24 MDT 2015},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

%%% TRETS 8(3), May 2015, article 15: automated elimination of idle
%%% functions through runtime reconfiguration.
@Article{Niu:2015:AEI,
  author =       "Xinyu Niu and Thomas C. P. Chau and Qiwei Jin and
                 Wayne Luk and Qiang Liu and Oliver Pell",
  title =        "Automating Elimination of Idle Functions by Runtime
                 Reconfiguration",
  journal =      j-TRETS,
  volume =       "8",
  number =       "3",
  pages =        "15:1--15:??",
  month =        may,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700415",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue May 19 17:05:24 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "A design approach is proposed to automatically
                 identify and exploit runtime reconfiguration
                 opportunities with optimised resource utilisation by
                 eliminating idle functions. We introduce
                 Reconfiguration Data Flow Graph, a hierarchical graph
                 structure enabling reconfigurable designs to be
                 synthesised in three steps: function analysis,
                 configuration organisation, and runtime solution
                 generation. The synthesised reconfigurable designs are
                 dynamically evaluated and selected under various
                 runtime conditions. Three applications---barrier option
                 pricing, particle filter, and reverse time
                 migration---are used in evaluating the proposed approach.
                 The runtime solutions approximate their theoretical
                 performance by eliminating idle functions and are 1.31
                 to 2.19 times faster than optimised static designs.
                 FPGA designs developed with the proposed approach are
                 up to 43.8 times faster than optimised CPU reference
                 designs and 1.55 times faster than optimised GPU
                 designs.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

%%% TRETS 8(3), May 2015, article 16: FPGA block memories used for
%%% protected cryptographic implementations.
@Article{Bhasin:2015:EFB,
  author = {Shivam Bhasin and Jean-Luc Danger and Sylvain Guilley
    and Wei He},
  title = {Exploiting {FPGA} Block Memories for Protected
    Cryptographic Implementations},
  journal = j-TRETS,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
  volume = {8},
  number = {3},
  articleno = {16},
  pages = {16:1--16:??},
  month = may,
  year = {2015},
  CODEN = {????},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  DOI = {https://doi.org/10.1145/2629552},
  abstract = {Modern field programmable gate arrays (FPGAs) are
    power packed with features to facilitate designers.
    Availability of features like large block memory
    (BRAM), digital signal processing cores, and embedded
    CPU makes the design strategy of FPGAs quite different
    from ASICs. FPGAs are also widely used in
    security-critical applications where protection against
    known attacks is of prime importance. We focus on
    physical attacks that target physical implementations.
    To design countermeasures against such attacks, the
    strategy for FPGA designers should be different from
    that in ASIC. The available features should be
    exploited to design compact and strong countermeasures.
    In this article, we propose methods to exploit the
    BRAMs in FPGAs for designing compact countermeasures.
    Internal BRAM can be used to optimize intrinsic
    countermeasures such as masking and dual-rail logics,
    which otherwise have significant overhead (at least $ 2
    \times $) compared to unprotected ones. The
    optimizations are applied on a real AES-128
    co-processor and tested for area overhead and
    resistance on Xilinx Virtex-5 chips. The presented
    masking countermeasure has an overhead of only 16\%
    when applied on AES. Moreover, the dual-rail precharge
    logic (DPL) countermeasure has been optimized to pack
    the whole sequential part in the BRAM, hence enhancing
    the security. Proper robustness evaluations are
    conducted to analyze the optimization in terms of area
    and security.},
  bibdate = {Tue May 19 17:05:24 MDT 2015},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

%%% TRETS 8(3), May 2015, article 17: CoEx, profiling-based
%%% algorithm/architecture co-exploration for ASIP design.
@Article{Eusse:2015:CNP,
  author = {Juan Fernando Eusse and Christopher Williams and
    Rainer Leupers},
  title = {{CoEx}: a Novel Profiling-Based Algorithm\slash
    Architecture Co-Exploration for {ASIP} Design},
  journal = j-TRETS,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
  volume = {8},
  number = {3},
  articleno = {17},
  pages = {17:1--17:??},
  month = may,
  year = {2015},
  CODEN = {????},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  DOI = {https://doi.org/10.1145/2629563},
  abstract = {Application-Specific Instruction Set Processors
    (ASIPs) provide the adequate performance/efficiency
    tradeoff for their particular application domain.
    Nevertheless, their design methodologies have stagnated
    during the past decade and are still based on a series
    of manual and time-consuming iterative steps.
    Furthermore, there exists a productivity gap between
    the point where an application is given as the target
    for processor customization and the time a customized
    architecture is available. Therefore, new tools are
    required that reduce the number of design iterations
    and bridge the aforementioned productivity gap. This
    can be achieved by (1) profiling technologies that, by
    adapting to the designer's needs, help to gain insight
    into application specifications, and (2)
    prearchitectural design technologies that give early
    yet accurate feedback on the impact of
    algorithmic/architectural design decisions. The first
    requirement is addressed in this article by proposing
    the multigrained profiling approach, which identifies
    the profiling needs at each step of ASIP design and
    lets the designer tailor the level of detail for
    application inspection. CoEx, a practical
    implementation of the approach, is also introduced. The
    second requirement is addressed by creating a
    prearchitectural estimation engine. This engine couples
    CoEx reports for an application with an abstract
    processor model and generates an estimate of the
    achievable performance. Both CoEx and the performance
    estimation engine are respectively evaluated for
    instrumentation-induced execution overhead and
    accuracy. Finally, the development of an ASIP
    architecture for an augmented reality computer vision
    application is presented. The ASIP achieves a gain of
    six times compared to the original application
    performance, after being developed in only 2 days.},
  bibdate = {Tue May 19 17:05:24 MDT 2015},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

%%% TRETS 8(3), May 2015, article 18: execution-trace-driven
%%% energy-reliability optimization for multimedia MPSoCs.
@Article{Das:2015:ETD,
  author =       "Anup Das and Amit Kumar Singh and Akash Kumar",
  title =        "Execution Trace-Driven Energy-Reliability Optimization
                 for Multimedia {MPSoCs}",
  journal =      j-TRETS,
  volume =       "8",
  number =       "3",
  pages =        "18:1--18:??",
  month =        may,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2665071",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue May 19 17:05:24 MDT 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Multiprocessor systems-on-chip (MPSoCs) are becoming a
                 popular design choice in current and future technology
                 nodes to accommodate the heterogeneous computing demand
                 of a multitude of applications enabled on these
                 platforms. Streaming multimedia and other
                 communication-centric applications constitute a
                 significant fraction of the application space of these
                 devices. The mapping of an application on an MPSoC is
                 an NP-hard problem. This has attracted researchers to
                 solve this problem both as stand-alone (best-effort)
                 and in conjunction with other optimization objectives,
                 such as energy and reliability. Most existing studies
                 on energy-reliability joint optimization are
                 static---that is, design time based. These techniques
                 fail to capture runtime variability such as resource
                 unavailability and dynamism associated with application
                 behaviors, which are typical of multimedia
                 applications. The few studies that consider dynamic
                 mapping of applications do not consider throughput
                 degradation, which directly impacts user satisfaction.
                 This article proposes a runtime technique to analyze
                 the execution trace of an application modeled as
                 Synchronous Data Flow Graphs (SDFGs) to determine its
                 mapping on a multiprocessor system with heterogeneous
                 processing units for different fault scenarios.
                 Further, communication energy is minimized for each of
                 these mappings while satisfying the throughput
                 constraint. Experiments conducted with synthetic and
                 real SDFGs demonstrate that the proposed technique
                 achieves significant improvement with respect to the
                 state-of-the-art approaches in terms of throughput and
                 storage overhead with less than 20\% energy overhead.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "18",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

%%% TRETS 8(3), May 2015, article 19: fault-tolerant topology
%%% reconfiguration using a maximum flow algorithm.
@Article{Ren:2015:EFT,
  author = {Yu Ren and Leibo Liu and Shouyi Yin and Jie Han and
    Shaojun Wei},
  title = {Efficient Fault-Tolerant Topology Reconfiguration
    Using a Maximum Flow Algorithm},
  journal = j-TRETS,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
  volume = {8},
  number = {3},
  articleno = {19},
  pages = {19:1--19:??},
  month = may,
  year = {2015},
  CODEN = {????},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  DOI = {https://doi.org/10.1145/2700417},
  abstract = {With an increasing number of processing elements (PEs)
    integrated on a single chip, fault-tolerant techniques
    are critical to ensure the reliability of such complex
    systems. In current reconfigurable architectures,
    redundant PEs are utilized for fault tolerance. In the
    presence of faulty PEs, the physical topologies of
    various chips may be different, so the concept of
    virtual topology from network embedding problem has
    been used to alleviate the burden for the operating
    systems. With limited hardware resources, how to
    reconfigure a system into the most effective virtual
    topology such that the maximum repair rate can be
    reached presents a significant challenge. In this
    article, a new approach using a maximum flow (MF)
    algorithm is proposed for an efficient topology
    reconfiguration in reconfigurable architectures. In
    this approach, topology reconfiguration is converted
    into a network flow problem by constructing a directed
    graph; the solution is then found by using the MF
    algorithm. This approach optimizes the use of spare PEs
    with minimal impacts on area, throughput, and delay,
    and thus it significantly improves the repair rate of
    faulty PEs. In addition, it achieves a polynomial
    reconfiguration time. Experimental results show that
    compared to previous methods, the MF approach increases
    the probability to repair faulty PEs by up to 50\%
    using the same redundant resources. Compared to a
    fault-free system, the throughput only decreases by
    less than 2.5\% and latency increases by less than 4\%.
    To consider various types of PEs in a practical
    application, a cost factor is introduced into the MF
    algorithm. An enhanced approach using a minimum-cost MF
    algorithm is further shown to be efficient in the
    fault-tolerant reconfiguration of heterogeneous
    reconfigurable architectures.},
  bibdate = {Tue May 19 17:05:24 MDT 2015},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

%%% TRETS 8(3), May 2015, article 20: low-level architecture with
%%% hybrid reconfiguration for evolvable hardware.
@Article{Dobai:2015:LLF,
  author = {Roland Dobai and Lukas Sekanina},
  title = {Low-Level Flexible Architecture with Hybrid
    Reconfiguration for Evolvable Hardware},
  journal = j-TRETS,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
  volume = {8},
  number = {3},
  articleno = {20},
  pages = {20:1--20:??},
  month = may,
  year = {2015},
  CODEN = {????},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  DOI = {https://doi.org/10.1145/2700414},
  abstract = {Field-programmable gate arrays (FPGAs) can be
    considered to be the most popular and successful
    platform for evolvable hardware. They allow one to
    establish and later reconfigure candidate solutions.
    Recent work in the field of evolvable hardware includes
    the use of virtual and native reconfigurations. Virtual
    reconfiguration is based on the change of functionality
    by hardware components implemented on top of FPGA
    resources. Native reconfiguration changes the FPGA
    resources directly by means provided by the FPGA
    manufacturer. Both of these approaches have their
    disadvantages. The virtual reconfiguration is
    characterized by lower maximal operational frequency of
    the resulting solutions, and the native reconfiguration
    is slower. In this work, a hybrid approach is used
    merging the advantages while limiting the disadvantages
    of the virtual and native reconfigurations. The main
    contribution is the new low-level architecture for
    evolvable hardware in the new Zynq-7000
    all-programmable system-on-chip. The proposed
    architecture offers high flexibility in comparison with
    other evolvable hardware systems by considering direct
    modification of the reconfigurable resources. The
    impact of the higher reconfiguration time of the native
    approach is limited by the dense placement of the
    proposed reconfigurable processing elements. These
    processing elements also ensure fast evaluation of
    candidate solutions. The proposed architecture is
    evaluated by evolutionary design of switching image
    filters and edge detectors. The experimental results
    demonstrate advantages over the previous approaches
    considering the time required for evolution, area
    overhead, and flexibility.},
  bibdate = {Tue May 19 17:05:24 MDT 2015},
  bibsource = {http://portal.acm.org/;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

%%% TRETS 8(4), October 2015, article 21: RC Middleware (RCMW), FPGA
%%% middleware for application portability and productivity.
%%% NOTE(review): the abstract's "enables application productivity"
%%% may be a slip for "application portability"; confirm against the
%%% published abstract before changing.
@Article{Kirchgessner:2015:LOF,
  author = {Robert Kirchgessner and Alan D. George and Greg
    Stitt},
  title = {Low-Overhead {FPGA} Middleware for Application
    Portability and Productivity},
  journal = j-TRETS,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
  volume = {8},
  number = {4},
  articleno = {21},
  pages = {21:1--21:??},
  month = oct,
  year = {2015},
  CODEN = {????},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  DOI = {https://doi.org/10.1145/2746404},
  abstract = {Reconfigurable computing devices such as
    field-programmable gate arrays (FPGAs) offer advantages
    over fixed-logic CPU and GPU architectures, including
    improved performance, superior power efficiency, and
    reconfigurability. The challenge of FPGA application
    development, however, has limited their acceptance in
    high-performance computing and high-performance
    embedded computing applications. FPGA development
    carries similar difficulties to hardware design,
    requiring that developers iterate through
    register-transfer level designs with cycle-level
    accuracy. Furthermore, the lack of hardware and
    software standards between FPGA platforms limits
    productivity and application portability, and makes
    porting applications between heterogeneous platforms a
    time-consuming and often challenging process. Recent
    efforts to improve FPGA productivity using high-level
    synthesis tools and languages show promise, but
    platform support remains limited and typically is left
    as a challenge for developers. To address these issues,
    we present RC Middleware (RCMW), a novel middleware
    that improves productivity and enables application and
    tool portability by abstracting away platform-specific
    details. RCMW provides an application-centric
    development environment, exposing only the resources
    and standardized interfaces required by an application,
    independent of the underlying platform. We demonstrate
    the portability and productivity benefits of RCMW using
    four heterogeneous platforms from three vendors. Our
    results indicate that RCMW enables application
    productivity and improves developer productivity, and
    that these benefits are achieved with less than 7\%
    performance and 3\% area overhead on average.},
  bibdate = {Mon Oct 5 08:47:01 MDT 2015},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

%%% TRETS 8(4), October 2015, article 22: RIFFA 2.1, a reusable
%%% integration framework for FPGA accelerators.
@Article{Jacobsen:2015:RRI,
  author = {Matthew Jacobsen and Dustin Richmond and Matthew
    Hogains and Ryan Kastner},
  title = {{RIFFA 2.1}: a Reusable Integration Framework for
    {FPGA} Accelerators},
  journal = j-TRETS,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
  volume = {8},
  number = {4},
  articleno = {22},
  pages = {22:1--22:??},
  month = oct,
  year = {2015},
  CODEN = {????},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  DOI = {https://doi.org/10.1145/2815631},
  abstract = {We present RIFFA 2.1, a reusable integration framework
    for Field-Programmable Gate Array (FPGA) accelerators.
    RIFFA provides communication and synchronization for
    FPGA accelerated applications using simple interfaces
    for hardware and software. Our goal is to expand the
    use of FPGAs as an acceleration platform by releasing,
    as open source, a framework that easily integrates
    software running on commodity CPUs with FPGA cores.
    RIFFA uses PCI Express (PCIe) links to connect FPGAs to
    a CPU's system bus. RIFFA 2.1 supports FPGAs from
    Xilinx and Altera, Linux and Windows operating systems,
    and allows multiple FPGAs to connect to a single host
    PC system. It has software bindings for C/C++, Java,
    Python, and Matlab. Tests show that data transfers
    between hardware and software can reach 97\% of the
    achievable PCIe link bandwidth.},
  bibdate = {Mon Oct 5 08:47:01 MDT 2015},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

%%% TRETS 8(4), October 2015, article 23: the Table-Hadamard Gaussian
%%% random number generator for FPGAs (also listed in prng.bib).
@Article{Thomas:2015:THG,
  author = {David B. Thomas},
  title = {The Table-{Hadamard} {GRNG}: an Area-Efficient {FPGA}
    {Gaussian} Random Number Generator},
  journal = j-TRETS,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
  volume = {8},
  number = {4},
  articleno = {23},
  pages = {23:1--23:??},
  month = oct,
  year = {2015},
  CODEN = {????},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  DOI = {https://doi.org/10.1145/2629607},
  abstract = {Gaussian random number generators (GRNGs) are an
    important component in parallel Monte Carlo simulations
    using FPGAs, where tens or hundreds of high-quality
    Gaussian samples must be generated per cycle using very
    few logic resources. This article describes the
    Table-Hadamard generator, which is a GRNG designed to
    generate multiple streams of random numbers in
    parallel. It uses discrete table distributions to
    generate pseudo-Gaussian base samples, then a parallel
    Hadamard transform to efficiently apply the central
    limit theorem. When generating 64 output samples, the
    Table-Hadamard requires just 130 slices per generated
    sample, which is a third of the resources needed by the
    next best technique, while still providing higher
    statistical quality.},
  bibdate = {Mon Oct 5 08:47:01 MDT 2015},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/prng.bib;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

%%% TRETS 8(4), October 2015, article 24: memory interface design for
%%% 3D stencil kernels on a massively parallel memory system.
@Article{Jin:2015:MID,
  author = {Zheming Jin and Jason D. Bakos},
  title = {Memory Interface Design for {$3$D} Stencil Kernels on
    a Massively Parallel Memory System},
  journal = j-TRETS,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
  volume = {8},
  number = {4},
  articleno = {24},
  pages = {24:1--24:??},
  month = oct,
  year = {2015},
  CODEN = {????},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  DOI = {https://doi.org/10.1145/2800788},
  abstract = {Massively parallel memory systems are designed to
    deliver high bandwidth at relatively low clock speed
    for memory-intensive applications implemented on
    programmable logic. For example, the Convey HC-1
    provides 1,024 DRAM banks to each of four FPGAs through
    a full crossbar, presenting a peak bandwidth of
    76.8GB/s to the user logic. Such highly parallel memory
    systems suffer from high latency, and their effective
    bandwidth is highly sensitive to access ordering. To
    achieve high performance, the user must use a
    customized memory interface that combines scheduling,
    latency hiding, and data reuse. In this article, we
    describe the design of a custom memory interface for 3D
    stencil kernels on the Convey HC-1 that incorporates
    these features. Experimental results show that the
    proposed memory interface achieves a speedup in runtime
    of 2.2 for 6-point stencil and 9.5 for 27-point stencil
    when compared to a naive memory interface.},
  bibdate = {Mon Oct 5 08:47:01 MDT 2015},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

%%% TRETS 8(4), October 2015, article 25: SuperDragon, a heterogeneous
%%% CPU/GPU/FPGA system for Cryo-EM 3D reconstruction.
@Article{Tan:2015:SHP,
  author = {Guangming Tan and Chunming Zhang and Wendi Wang and
    Peiheng Zhang},
  title = {{SuperDragon}: a Heterogeneous Parallel System for
    Accelerating {$3$D} Reconstruction of Cryo-Electron
    Microscopy Images},
  journal = j-TRETS,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal = {ACM Transactions on Reconfigurable Technology and
    Systems (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
  volume = {8},
  number = {4},
  articleno = {25},
  pages = {25:1--25:??},
  month = oct,
  year = {2015},
  CODEN = {????},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  DOI = {https://doi.org/10.1145/2740966},
  abstract = {The data deluge in medical imaging processing requires
    faster and more efficient systems. Due to the advance
    in recent heterogeneous architecture, there has been a
    resurgence in research aimed at domain-specific
    accelerators. In this article, we develop an
    experimental system SuperDragon for evaluating
    acceleration of a single-particle Cryo-electron
    microscopy (Cryo-EM) 3D reconstruction package EMAN
    through a hybrid of CPU, GPU, and FPGA parallel
    architecture. Based on a comprehensive workload
    characterization, we exploit multigrained parallelism
    in the Cryo-EM 3D reconstruction algorithm and
    investigate a proper computational mapping to the
    underlying heterogeneous architecture. The package is
    restructured with task-level (MPI), thread-level
    (OpenMP), and data-level (GPU and FPGA) parallelism.
    Especially, the proposed FPGA accelerator is a stream
    architecture that emphasizes the importance of
    optimizing computing dominated data access patterns.
    Besides, the configurable computing streams are
    constructed by arranging the hardware modules and
    bypassing channels to form a linear deep pipeline.
    Compared to the multicore (six-core) program, the GPU
    and FPGA implementations achieve speedups of 8.4 and
    2.25 times in execution time while improving power
    efficiency by factors of 7.2 and 14.2, respectively.},
  bibdate = {Mon Oct 5 08:47:01 MDT 2015},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

%%% TRETS 8(4), article 26 (October 2015): runtime reshaping of
%%% virtualizable MPSoCs for self-healing.
%%% NOTE(review): abstract wording corrected to "design flows"; the
%%% publisher's text may carry the original typo -- confirm if exact
%%% quotation matters.
@Article{Biedermann:2015:SDR,
  author =       "Alexander Biedermann and Sorin A. Huss and Adeel
                 Israr",
  title =        "Safe Dynamic Reshaping of Reconfigurable {MPSoC}
                 Embedded Systems for Self-Healing and Self-Adaption
                 Purposes",
  journal =      j-TRETS,
  volume =       "8",
  number =       "4",
  pages =        "26:1--26:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700416",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Oct 5 08:47:01 MDT 2015",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Multiprocessor system-on-chip (MPSoC) architectures
                 are a huge challenge in embedded system design. This
                 situation arises from the fact that available MPSoCs
                 and related design flows are not tailored to the
                 specific needs of embedded systems. This work
                 demonstrates how to provide self-healing properties in
                 embedded MPSoC design. This is achieved by combining
                 the features of a generic approach to create
                 virtualizable MPSoCs out of off-the-shelf embedded
                 processors with a methodology to derive system
                 configurations, such as task-processor bindings, which
                 are optimal in terms of safety and execution time. The
                 virtualization properties enable a reshaping of the
                 MPSoC at runtime. Thus, system configurations may be
                 exchanged rapidly in a dynamic fashion. As a main
                 result of this work, embedded multiprocessor systems
                 are introduced, which dynamically adapt to changing
                 operating conditions, possible module defects, and
                 internal state changes. We demonstrate the figures of
                 merit of such reconfigurable MPSoC embedded systems by
                 means of a complex automotive application scenario
                 mapped to an FPGA featuring a virtualizable array of
                 eight soft-core processors.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "26",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

%%% TRETS 9(1), article 1 (November 2015): hardware evaluation of
%%% program invariants for soft-error detection.
@article{Park:2015:PIC,
  author          = {Joonseok Park and Pedro C. Diniz},
  title           = {Program-Invariant Checking for Soft-Error Detection
                     using Reconfigurable Hardware},
  journal         = j-TRETS,
  volume          = {9},
  number          = {1},
  pages           = {1:1--1:??},
  month           = nov,
  year            = {2015},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2751563},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Dec 22 16:19:56 MST 2015},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {There is an increasing concern about transient errors
                     in deep submicron processor architectures.
                     Software-only error detection approaches that exploit
                     program invariants for silent error detection incur
                     large execution overheads and are unreliable as state
                     can be corrupted after invariant checkpoints. In this
                     article, we explore the use of configurable hardware
                     structures for the continuous evaluation of high-level
                     program invariants at the assembly level. We evaluate
                     the resource requirements and performance of the
                     proposed predicate-evaluation hardware structures when
                     integrated with a 32-bit MIPS soft core on a
                     contemporary reconfigurable hardware device. The
                     results, for a small set of kernel codes, reveal that
                     these hardware structures require a very small number
                     of hardware resources with negligible impact on the
                     processor core that they are integrated in. Moreover,
                     the amount of resources is fairly insensitive to the
                     complexity of the invariants, thus making the proposed
                     structures an attractive alternative to software-only
                     predicate checking.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {1},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets}
}

%%% TRETS 9(1), article 2 (November 2015): FPGA-based parallel DBSCAN
%%% clustering architecture.
@article{Scicluna:2015:AMF,
  author          = {Neil Scicluna and Christos-Savvas Bouganis},
  title           = {{ARC 2014}: a Multidimensional {FPGA}-Based Parallel
                     {DBSCAN} Architecture},
  journal         = j-TRETS,
  volume          = {9},
  number          = {1},
  pages           = {2:1--2:??},
  month           = nov,
  year            = {2015},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2724722},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Dec 22 16:19:56 MST 2015},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Clustering large numbers of data points is a very
                     computationally demanding task that often needs to be
                     accelerated in order to be useful in practical
                     applications. This work focuses on the Density-Based
                     Spatial Clustering of Applications with Noise (DBSCAN)
                     algorithm, which is one of the state-of-the-art
                     clustering algorithms, and targets its acceleration
                     using an FPGA device. The article presents an
                     optimized, scalable, and parameterizable architecture
                     that takes advantage of the internal memory structure
                     of modern FPGAs in order to deliver a high-performance
                     clustering system. Post-synthesis simulation results
                     show that the developed system can obtain mean speedups
                     of 31$ \times $ in real-world tests and 202$ \times $
                     in synthetic tests when compared to state-of-the-art
                     software counterparts running on a quad-core 3.4GHz
                     Intel i7-2600k. Additionally, this implementation is
                     also capable of clustering data with any number of
                     dimensions without impacting the performance.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {2},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets}
}

%%% TRETS 9(1), article 3 (November 2015): side-channel-protected
%%% Curve25519 on FPGAs.
@article{Sasdrich:2015:ICS,
  author          = {Pascal Sasdrich and Tim G{\"u}neysu},
  title           = {Implementing {Curve25519} for Side-Channel--Protected
                     Elliptic Curve Cryptography},
  journal         = j-TRETS,
  volume          = {9},
  number          = {1},
  pages           = {3:1--3:??},
  month           = nov,
  year            = {2015},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2700834},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Dec 22 16:19:56 MST 2015},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {For security-critical embedded applications Elliptic
                     Curve Cryptography (ECC) has become the predominant
                     cryptographic system for efficient key agreement and
                     digital signatures. However, ECC still involves complex
                     modular arithmetic that is a particular burden for
                     small processors. In this context, Bernstein proposed
                     the highly efficient ECC instance Curve25519 that
                     particularly enables efficient software implementations
                     at a security level comparable to AES-128 with inherent
                     resistance to simple power analysis (SPA) and timing
                     attacks. In this work, we show that Curve25519 is
                     likewise competitive on FPGAs even when countermeasures
                     to thwart side-channel power analysis are included. Our
                     basic multicore DSP-based architectures achieves a
                     maximal performance of more than 32,000 point
                     multiplications per second on a Xilinx Zynq 7020 FPGA.
                     Including a mix of side-channel countermeasures to
                     impede simple and differential power analysis, we still
                     achieve more than 27,500 point multiplications per
                     second with a moderate increase in logic resources.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {3},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets}
}

%%% TRETS 9(1), article 4 (November 2015): enhanced adaptive recoding
%%% rotation CORDIC (EARC).
@article{Zhang:2015:EAR,
  author          = {Jianfeng Zhang and Paul Chow and Hengzhu Liu},
  title           = {An Enhanced Adaptive Recoding Rotation {CORDIC}},
  journal         = j-TRETS,
  volume          = {9},
  number          = {1},
  pages           = {4:1--4:??},
  month           = nov,
  year            = {2015},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2812813},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Dec 22 16:19:56 MST 2015},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/elefunt.bib;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {The Conventional Coordinate Rotation Digital Computer
                     (CORDIC) algorithm has been widely used in many
                     applications, particularly in Direct Digital Frequency
                     Synthesizers (DDS) and Fast Fourier Transforms (FFT).
                     However, CORDIC is constrained by the excessive number
                     of iterations, angle data path, and scaling factor
                     compensation. In this article, an enhanced adaptive
                     recoding CORDIC (EARC) is proposed. It uses the
                     enhanced adaptive recoding method to reduce the
                     required iterations and adopts the trigonometric
                     transformation scheme to scale up the rotation angles.
                     Computing sine and cosine is used first to compare the
                     core functionality of EARC with basic CORDIC; then a
                     16-bit DDS and a 1,024-point FFT based on EARC are
                     evaluated to demonstrate the benefits of EARC in larger
                     applications. All the proposed architectures are
                     validated on a Virtex 5 FPGA development platform.
                     Compared with a commercial implementation of CORDIC,
                     EARC requires 33.3\% less hardware resources, provides
                     a twofold speedup, dissipates 70.4\% less power, and
                     improves accuracy in terms of the Bit Error Position
                     (BEP). Compared to the state-of-the-art Hybrid CORDIC,
                     EARC reduces latency by 11.1\% and consumes 17\% less
                     power. Compared with a commercial implementation of
                     DDS, the dissipated power of the proposed DDS is
                     reduced by 27.2\%. The proposed DDS improves
                     Spurious-Free Dynamic Range (SFDR) by nearly 7 dBc and
                     dissipates 21.8\% less power when compared with a
                     recently published DDS circuit. The FFT based on EARC
                     dissipates a factor of 2.05 less power than the
                     commercial FFT even when choosing the 100\% toggle rate
                     for the FFT based on EARC and the 12.5\% toggle rate
                     for the commercial FFT. Compared with a recently
                     published FFT, the FFT based on EARC improves
                     Signal-to-Noise Ratio (SNR) by 8.9 dB and consumes
                     7.78\% less power.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {4},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets}
}

%%% TRETS 9(1), article 5 (November 2015): guest editorial for the
%%% ARC 2014 papers; the record carries no abstract.
@article{Goehringer:2015:GEA,
  author          = {Diana Goehringer and Marco D. Santambrogio and
                     Jo{\~a}o M. P. Cardoso and Koen Bertels},
  title           = {Guest Editorial: {ARC 2014}},
  journal         = j-TRETS,
  volume          = {9},
  number          = {1},
  pages           = {5:1--5:??},
  month           = nov,
  year            = {2015},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2831431},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Dec 22 16:19:56 MST 2015},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {5},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets}
}

%%% TRETS 9(1), article 6 (November 2015): using the FPGA configuration
%%% infrastructure in place of a low-speed memory bus.
@article{Heyse:2015:IRL,
  author          = {Karel Heyse and Jente Basteleus and Brahim {Al Farisi}
                     and Dirk Stroobandt and Oliver Kadlcek and Oliver
                     Pell},
  title           = {On the Impact of Replacing Low-Speed Configuration
                     Buses on {FPGAs} with the Chip's Internal Configuration
                     Infrastructure},
  journal         = j-TRETS,
  volume          = {9},
  number          = {1},
  pages           = {6:1--6:??},
  month           = nov,
  year            = {2015},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2700835},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Dec 22 16:19:56 MST 2015},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {It is common for large hardware designs to have a
                     number of registers or memories whose contents have to
                     be changed very seldom (e.g., only at startup). The
                     conventional way of accessing these memories is through
                     a low-speed memory bus. This bus uses valuable hardware
                     resources, introduces long global connections, and
                     contributes to routing congestion. Hence, it has an
                     impact on the overall design even though it is only
                     rarely used. A Field-Programmable Gate Array (FPGA)
                     already contains a global communication mechanism in
                     the form of its configuration infrastructure. In this
                     article, we evaluate the use of the configuration
                     infrastructure as a replacement for a low-speed memory
                     bus on the Maxeler HPC platform. We find that by
                     removing the conventional low-speed memory bus, the
                     maximum clock frequency of some applications can be
                     improved by 8\%. Improvements by 25\% and more are also
                     attainable, but constraints of the Xilinx
                     reconfiguration infrastructure prevent fully exploiting
                     these benefits at the moment. We present a number of
                     possible changes to the Xilinx reconfiguration
                     infrastructure and tools that would solve this and make
                     these results more widely applicable.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {6},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets}
}

%%% TRETS 9(1), article 7 (November 2015): over-clocked KLT designs on
%%% FPGAs under process, voltage, and temperature variation.
%%% NOTE(review): in the abstract, the accent in "Karhunen--Lo{\`e}ve"
%%% was restored and the missing verb in "that is often implemented" was
%%% supplied; the publisher's text may differ -- confirm if exact
%%% quotation matters.
@Article{Duarte:2015:ACK,
  author =       "Rui Policarpo Duarte and Christos-Savvas Bouganis",
  title =        "{ARC 2014} Over-Clocking {KLT} Designs on {FPGAs}
                 under Process, Voltage, and Temperature Variation",
  journal =      j-TRETS,
  volume =       "9",
  number =       "1",
  pages =        "7:1--7:??",
  month =        nov,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2818380",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Dec 22 16:19:56 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Karhunen--Lo{\`e}ve Transformation is a widely used
                 algorithm in signal processing that is often implemented
                 with high-throughput requisites. This work presents a
                 novel methodology to optimise KLT designs on FPGAs that
                 outperform typical design methodologies, through a
                 prior characterisation of the arithmetic units in the
                 datapath of the circuit under various operating
                 conditions. Limited by the ever-increasing process
                 variation, the delay models available in synthesis
                 tools are no longer suitable for extreme performance
                 optimisation of designs, and as they are generic, they
                 need to consider the worst-case performance for a given
                 fabrication process. Hence, they heavily penalise the
                 maximum possible achieved performance of a design by
                 leaving safety margin. This work presents a novel
                 unified optimisation framework which contemplates a
                 prior characterisation of the embedded multipliers on
                 the target FPGA device under process, voltage, and
                 temperature variation. The proposed framework allows a
                 design space exploration leading to designs without any
                 latency overheads that achieve high throughput while
                 producing less errors than typical methodologies,
                 operating with the same throughput. Experimental
                 results demonstrate that the proposed methodology
                 outperforms the typical implementation in three
                 real-life design strategies: high performance, low
                 power, and temperature variation; and it produced
                 circuit designs that performed up to 18dB better when
                 over-clocked.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

%%% TRETS 9(1), article 8 (November 2015): FPGA heap-based priority
%%% queue with a parallel index-aware tree for image coding.
@article{Bai:2015:ATF,
  author          = {Yuhui Bai and Syed Zahid Ahmed and Bertrand Granado},
  title           = {{ARC 2014}: Towards a Fast {FPGA} Implementation of a
                     Heap-Based Priority Queue for Image Coding Using a
                     Parallel Index-Aware Tree},
  journal         = j-TRETS,
  volume          = {9},
  number          = {1},
  pages           = {8:1--8:??},
  month           = nov,
  year            = {2015},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2766454},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Dec 22 16:19:56 MST 2015},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {The embedded image processing systems like smartphones
                     and digital cameras have tight limits on storage,
                     computation power, network connectivity, and battery
                     usage. These limitations make it important to ensure
                     efficient image coding. In the article, we present a
                     novel heap-based priority queue structure employed by
                     an Adaptive Scanning of Wavelet Data scheme (ASWD)
                     targeting an embedded platform. ASWD is a context
                     modeling block implemented via priority queues in a
                     wavelet-based image coder to reorganize the wavelet
                     coefficients into locally stationary sequences. The
                     architecture we propose exploits efficient use of
                     FPGA's on-chip dual-port memories in an adaptive
                     manner. Innovations of index-aware system linked to
                     each element in the queue makes the location of queue
                     element traceable in the heap as per the requirements
                     of the ASWD algorithm. Moreover, use of 4-port memories
                     along with intelligent data concatenation of queue
                     elements yielded in a cost effective enhanced memory
                     access. The memory ports are adaptively assigned to
                     different units during different processing phases in a
                     manner to optimally take advantage of memory access
                     required by that phase. The architectural innovations
                     can also be exploited in other applications that
                     require efficient hardware implementations of generic
                     priority queue or classical sorting applications which
                     sort into the index. We designed and validated the
                     hardware on an Altera's Stratix IV FPGA as an IP
                     accelerator in a Nios II processor based System on
                     Chip. We show that our architecture at 150MHz can
                     provide 45X speedup compared to an embedded ARM
                     Cortex-A9 processor at 666MHz targeting the throughput
                     of 10MB/s.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {8},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets}
}

%%% TRETS 9(2), article 9 (February 2016): CORDIC-based systolic array
%%% for QR decomposition.
@article{Zhang:2016:CBE,
  author          = {Jianfeng Zhang and Paul Chow and Hengzhu Liu},
  title           = {{CORDIC}-Based Enhanced Systolic Array Architecture
                     for {$ Q R $} Decomposition},
  journal         = j-TRETS,
  volume          = {9},
  number          = {2},
  pages           = {9:1--9:??},
  month           = feb,
  year            = {2016},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2827700},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Dec 22 16:19:57 MST 2015},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Multiple input multiple output (MIMO) with orthogonal
                     frequency division multiplexing (OFDM) systems
                     typically use orthogonal-triangular (QR) decomposition.
                     In this article, we present an enhanced systolic array
                     architecture to realize QR decomposition based on the
                     Givens rotation (GR) method for a 4 $ \times $ 4 real
                     matrix. The coordinate rotation digital computer
                     (CORDIC) algorithm is adopted and modified to speed up
                     and simplify the process of GR. To verify the function
                     and evaluate the performance, the proposed
                     architectures are validated on a Virtex 5 FPGA
                     development platform. Compared to a commercial
                     implementation of vectoring CORDIC, the enhanced
                     vectoring CORDIC is presented that uses 37.7\% less
                     hardware resources, dissipates 71.6\% less power, and
                     provides a 1.8 times speedup while maintaining the same
                     computation accuracy. The enhanced QR systolic array
                     architecture based on the enhanced vectoring CORDIC
                     saves 24.5\% in power dissipation, provides a factor of
                     1.5-fold improvement in throughput, and the hardware
                     efficiency is improved 1.45-fold with no accuracy
                     penalty when compared to our previously proposed QR
                     systolic array architecture.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {9},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets}
}

%%% TRETS 9(2), article 10 (February 2016): separation-logic heap
%%% analysis for high-level synthesis.
@article{Winterstein:2016:SLH,
  author          = {Felix J. Winterstein and Samuel R. Bayliss and George
                     A. Constantinides},
  title           = {Separation Logic for High-Level Synthesis},
  journal         = j-TRETS,
  volume          = {9},
  number          = {2},
  pages           = {10:1--10:??},
  month           = feb,
  year            = {2016},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2836169},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Dec 22 16:19:57 MST 2015},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {High-Level Synthesis (HLS) promises a significant
                     shortening of the FPGA design cycle by raising the
                     abstraction level of the design entry to high-level
                     languages such as C/C++. However, applications using
                     dynamic, pointer-based data structures and dynamic
                     memory allocation remain difficult to implement well,
                     yet such constructs are widely used in software.
                     Automated optimizations that leverage the memory
                     bandwidth of FPGAs by distributing the application data
                     over separate banks of on-chip memory are often
                     ineffective in the presence of dynamic data structures
                     due to the lack of an automated analysis of
                     pointer-based memory accesses. In this work, we take a
                     step toward closing this gap. We present a static
                     analysis for pointer-manipulating programs that
                     automatically splits heap-allocated data structures
                     into disjoint, independent regions. The analysis
                     leverages recent advances in separation logic, a
                     theoretical framework for reasoning about
                     heap-allocated data that has been successfully applied
                     in recent software verification tools. Our algorithm
                     focuses on dynamic data structures accessed in loops
                     and is accompanied by automated source-to-source
                     transformations that enable automatic loop
                     parallelization and memory partitioning by
                     off-the-shelf HLS tools. We demonstrate the successful
                     loop parallelization and memory partitioning by our
                     tool flow using three real-life applications that
                     build, traverse, update, and dispose of dynamically
                     allocated data structures. Our case studies, comparing
                     the automatically parallelized to the direct HLS
                     implementations, show an average latency reduction by a
                     factor of 2 $ \times $ across our benchmarks.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {10},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets}
}

%%% TRETS 9(2), article 12 (February 2016): coarse-grained parallel
%%% architecture for fingerprint matching.
@article{Xu:2016:CGA,
  author          = {Jinwei Xu and Jingfei Jiang and Yong Dou and Xiaolong
                     Shen and Zhiqiang Liu},
  title           = {Coarse-Grained Architecture for Fingerprint Matching},
  journal         = j-TRETS,
  volume          = {9},
  number          = {2},
  pages           = {12:1--12:??},
  month           = feb,
  year            = {2016},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2791296},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Dec 22 16:19:57 MST 2015},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Fingerprint matching is a key procedure in fingerprint
                     identification applications. The minutiae-based
                     fingerprint matching algorithm is one of the most
                     typical algorithms achieving a reasonably correct
                     recognition rate. This study proposes a coarse-grained
                     parallel architecture called fingerprint matching core
                     (FMC) to accelerate fingerprint matching. The proposed
                     architecture has a two-level parallel structure (i.e.,
                     parallel among groups (PAG) and parallel in group
                     (PIG)). A multirequest controller is added to the PAG
                     structure to obtain a concurrent operation of the
                     multiple processing element group (PEG). The DDR3
                     controller is used in the PIG structure to read eight
                     minutiae from eight different fingerprints and realize
                     the simultaneous computation of the eight PEs. The
                     whole system is implemented on a Xilinx FPGA board with
                     a Virtex VII XC7VX485T chip. The 16-PEG FMC achieves a
                     throughput of about 9.63 million fingerprint pairs per
                     second, which is larger than that achieved on a Tesla
                     K20c platform. The software execution times are also
                     measured on the 2.93GHz Intel Xeon 5670, 2.3GHz AMD
                     Opteron(tm) Processor 6376, and Tesla K20c platforms.
                     The Intel Xeon 5670 has two processors with 12 cores,
                     and the AMD Opteron(tm) Processor 6376 has two
                     processors with 16 cores. Moreover, the throughput is
                     about 31 times that achieved on a 2.93GHz Intel Xeon
                     5670 single core.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {12},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets}
}

@Article{Zaidi:2016:VSF,
  author =       "Ali Mustafa Zaidi and David Greaves",
  title =        "Value State Flow Graph: a Dataflow Compiler {IR} for
                 Accelerating Control-Intensive Code in Spatial
                 Hardware",
  journal =      j-TRETS,
  volume =       "9",
  number =       "2",
  pages =        "14:1--14:??",
  month =        feb,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2807702",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Dec 22 16:19:57 MST 2015",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Although custom (and reconfigurable) computing can
                 provide orders-of-magnitude improvements in energy
                 efficiency and performance for many numeric,
                 data-parallel applications, performance on nonnumeric,
                 sequential code is often worse than conventional
                 superscalar processors. This work attempts to improve
                 sequential performance in custom hardware by (a)
                 switching from a statically scheduled to a dynamically
                 scheduled (dataflow) execution model and (b) developing
                 a new compiler IR for high-level synthesis---the value
                 state flow graph (VSFG)---that enables aggressive
                 exposition of ILP even in the presence of complex
                 control flow. Compared to existing control-data flow
                 graph (CDFG)-based IRs, the VSFG exposes more
                 instruction-level parallelism from control-intensive
                 sequential code by exploiting aggressive speculation,
                 enabling control dependence analysis, as well as
                 execution along multiple flows of control. This new IR
                 is directly implemented as a static-dataflow graph in
                 hardware by our prototype high-level synthesis tool
                 chain and shows an average speedup of 1.13$ \times $
                 over equivalent hardware generated using LegUp, an
                 existing CDFG-based HLS tool. Furthermore, the VSFG
                 allows us to further trade area and energy for
                 performance through loop unrolling, increasing the
                 average speedup to 1.55$ \times $, with a peak speedup
                 of 4.05$ \times $. Our VSFG-based hardware approaches
                 the sequential cycle counts of an Intel Nehalem Core i7
                 processor while consuming only 0.25$ \times $ the
                 energy of an in-order Altera Nios II f processor.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@article{Raitza:2016:RRN,
  author =       {Michael Raitza and Markus Vogt and Christian
                 Hochberger and Thilo Pionteck},
  title =        {{RAW 2014}: Random Number Generators on {FPGAs}},
  journal =      j-TRETS,
  volume =       {9},
  number =       {2},
  pages =        {15:1--15:??},
  articleno =    {15},
  month =        feb,
  year =         {2016},
  doi =          {https://doi.org/10.1145/2807699},
  issn =         {1936-7406 (print), 1936-7414 (electronic)},
  issn-l =       {1936-7406},
  coden =        {????},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-url =  {https://dl.acm.org/loi/trets},
  bibdate =      {Tue Dec 22 16:19:57 MST 2015},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/prng.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract =     {Random numbers are important ingredients in a number
                 of applications. Especially in a security context, they
                 must be well distributed and unpredictable. We
                 investigate the practical use of random number
                 generators (RNGs) that are built from digital elements
                 found in FPGAs. For this, we implement different types
                 of ring oscillators (ROs) and memory collision-based
                 circuits on FPGAs from major vendors. Implementing RNGs
                 on the same device as the rest of the system benefits
                 an overall reduction of vulnerability to attacks and
                 wire tapping. Nevertheless, we investigate different
                 attacks by tampering with power supply, chip
                 temperature, and by exposition to strong magnetic
                 fields and X-radiation. We also consider their
                 usability as massively deployed components, whose
                 functionality cannot be tested individually anymore, by
                 conducting a technology invariance experiment. Our
                 experiments show that BlockRAM-based RNGs cannot be
                 considered as a suitable entropy source. We further
                 show that RO-based RNGs work reliably under a wide
                 range of operating conditions. While magnetic fields
                 and X-rays did not induce any notable change, voltage
                 and temperature variations caused an increase in
                 propagation delays within the circuits. We show how
                 reliable RNGs can be constructed and deployed on
                 FPGAs.},
}

@article{Attia:2016:RAD,
  author =       {Osama G. Attia and Kevin R. Townsend and Phillip H.
                 Jones and Joseph Zambreno},
  title =        {A Reconfigurable Architecture for the Detection of
                 Strongly Connected Components},
  journal =      j-TRETS,
  volume =       {9},
  number =       {2},
  pages =        {16:1--16:??},
  articleno =    {16},
  month =        feb,
  year =         {2016},
  doi =          {https://doi.org/10.1145/2807700},
  issn =         {1936-7406 (print), 1936-7414 (electronic)},
  issn-l =       {1936-7406},
  coden =        {????},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-url =  {https://dl.acm.org/loi/trets},
  bibdate =      {Tue Dec 22 16:19:57 MST 2015},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract =     {The Strongly Connected Components (SCCs) detection
                 algorithm serves as a keystone for many graph analysis
                 applications. The SCC execution time for large-scale
                 graphs, as with many other graph algorithms, is
                 dominated by memory latency. In this article, we
                 investigate the design of a parallel hardware
                 architecture for the detection of SCCs in directed
                 graphs. We propose a design methodology that alleviates
                 memory latency and problems with irregular memory
                 access. The design is composed of 16 processing
                 elements dedicated to parallel Breadth-First Search
                 (BFS) and eight processing elements dedicated to
                 finding intersection in parallel. Processing elements
                 are organized to reuse resources and utilize memory
                 bandwidth efficiently. We demonstrate a prototype of
                 our design using the Convey HC-2 system, a commercial
                 high-performance reconfigurable computing coprocessor.
                 Our experimental results show a speedup of as much as
                 17$ \times $ for detecting SCCs in large-scale graphs
                 when compared to a conventional sequential software
                 implementation.},
}

@article{Kapre:2016:OSV,
  author =       {Nachiket Kapre},
  title =        {Optimizing Soft Vector Processing in {FPGA}-Based
                 Embedded Systems},
  journal =      j-TRETS,
  volume =       {9},
  number =       {3},
  pages =        {17:1--17:??},
  articleno =    {17},
  month =        jul,
  year =         {2016},
  doi =          {https://doi.org/10.1145/2912884},
  issn =         {1936-7406 (print), 1936-7414 (electronic)},
  issn-l =       {1936-7406},
  coden =        {????},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-url =  {https://dl.acm.org/loi/trets},
  bibdate =      {Thu Jul 14 16:35:43 MDT 2016},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract =     {Soft vector processors can augment and extend the
                 capability of FPGA-based embedded systems-on-chip such
                 as the Xilinx Zynq. However, configuring and optimizing
                 the soft processor for best performance is hard. We
                 must consider architectural parameters such as
                 precision, vector lane count, vector length, chunk
                 size, and DMA scheduling to ensure efficient execution
                 of code on the soft vector processing platform. To
                 simplify the design process, we develop a compiler
                 framework and an autotuning runtime that splits the
                 optimization into a combination of static and dynamic
                 passes that map data-parallel computations to the soft
                 processor. We compare and contrast implementations
                 running on the scalar ARM processor, the embedded NEON
                 hard vector engine, and low-level streaming Verilog
                 designs with the VectorBlox MXP soft vector processor.
                 Across a range of data-parallel benchmarks, we show
                 that the MXP soft vector processor can outperform other
                 organizations by up to $ 4 \times $ while saving $
                 \approx 10 \% $ dynamic power. Our compilation and
                 runtime framework is also able to outperform the gcc
                 NEON vectorizer under certain conditions by explicit
                 generation of NEON intrinsics and performance tuning of
                 the autogenerated data-parallel code. When constrained
                 by IO bandwidth, soft vector processors are even
                 competitive with spatial Verilog implementations of
                 computation.},
}

@Article{Dehon:2016:ISI,
  author =       "Andr{\'e} DeHon and Derek Chiou",
  title =        "Introduction to Special Issue on Reconfigurable
                 Components with Source Code",
  journal =      j-TRETS,
  volume =       "9",
  number =       "3",
  pages =        "19:1--19:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2907949",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Jul 14 16:35:43 MDT 2016",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Fang:2016:OSV,
  author =       "Xin Fang and Miriam Leeser",
  title =        "Open-Source Variable-Precision Floating-Point Library
                 for Major Commercial {FPGAs}",
  journal =      j-TRETS,
  volume =       "9",
  number =       "3",
  pages =        "20:1--20:17",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2851507",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Feb 8 10:53:20 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/2851507",
  abstract =     "There is increased interest in implementing
                 floating-point designs for different precisions that
                 take advantage of the flexibility offered by
                 Field-Programmable Gate Arrays (FPGAs). In this
                 article, we present updates to the Variable-precision
                 FLOATing Point Library (VFLOAT) developed at
                 Northeastern University and highlight recent
                 improvements in implementations for implementing
                 reciprocal, division, and square root components that
                 scale to double precision for FPGAs from the two major
                 vendors: Altera and Xilinx. Our library is open source
                 and flexible and provides the user with many options. A
                 designer has many tradeoffs to consider including clock
                 frequency, total latency, and resource usage as well as
                 target architecture. We compare the generated cores to
                 those produced by each vendor and to another popular
                 open-source tool: FloPoCo. VFLOAT has the advantage of
                 not tying the user's design to a specific target
                 architecture and of providing the maximum flexibility
                 for all options including clock frequency and latency
                 compared to other alternatives. Our results show that
                 variable-precision as well as double-precision designs
                 can easily be accommodated and the resulting components
                 are competitive and in many cases superior to the
                 alternatives.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "20",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@article{Wilson:2016:UAA,
  author =       {David Wilson and Greg Stitt},
  title =        {The Unified Accumulator Architecture: a Configurable,
                 Portable, and Extensible Floating-Point Accumulator},
  journal =      j-TRETS,
  volume =       {9},
  number =       {3},
  pages =        {21:1--21:??},
  articleno =    {21},
  month =        jul,
  year =         {2016},
  doi =          {https://doi.org/10.1145/2809432},
  issn =         {1936-7406 (print), 1936-7414 (electronic)},
  issn-l =       {1936-7406},
  coden =        {????},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-url =  {https://dl.acm.org/loi/trets},
  bibdate =      {Thu Jul 14 16:35:43 MDT 2016},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract =     {Applications accelerated by field-programmable gate
                 arrays (FPGAs) often require pipelined floating-point
                 accumulators with a variety of different trade-offs.
                 Although previous work has introduced numerous
                 floating-point accumulation architectures, few cores
                 are available for public use, which forces designers to
                 use fixed-point implementations or vendor-provided
                 cores that are not portable and are often not optimized
                 for the desired set of trade-offs. In this article, we
                 combine and extend previous floating-point accumulator
                 architectures into a configurable, open-source core,
                 referred to as the unified accumulator architecture
                 (UAA), which enables designers to choose between
                 different trade-offs for different applications. UAA is
                 portable across FPGAs and allows designers to
                 specialize the underlying adder core to take advantage
                 of device-specific optimizations. By providing an
                 extensible, open-source implementation, we hope for the
                 research community to extend the provided core with new
                 architectures and optimizations.},
}

@article{Abdelhadi:2016:MSM,
  author =       {Ameer M. S. Abdelhadi and Guy G. F. Lemieux},
  title =        {Modular Switched Multiported {SRAM}-Based Memories},
  journal =      j-TRETS,
  volume =       {9},
  number =       {3},
  pages =        {22:1--22:??},
  articleno =    {22},
  month =        jul,
  year =         {2016},
  doi =          {https://doi.org/10.1145/2851506},
  issn =         {1936-7406 (print), 1936-7414 (electronic)},
  issn-l =       {1936-7406},
  coden =        {????},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-url =  {https://dl.acm.org/loi/trets},
  bibdate =      {Thu Jul 14 16:35:43 MDT 2016},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract =     {Multiported RAMs are essential for high-performance
                 parallel computation systems. VLIW and vector
                 processors, CGRAs, DSPs, CMPs, and other processing
                 systems often rely upon multiported memories for
                 parallel access. Although memories with a large number
                 of read and write ports are important, their high
                 implementation cost means that they are used sparingly.
                 As a result, FPGA vendors only provide dual-ported
                 block RAMs (BRAMs) to handle the majority of usage
                 patterns. Furthermore, recent attempts to create
                 FPGA-based multiported memories suffer from low storage
                 utilization. Whereas most approaches provide simple
                 unidirectional ports with a fixed read or write, others
                 propose true bidirectional ports where each port
                 dynamically switches read and write. True RAM ports are
                 useful for systems with transceivers and provide high
                 RAM flexibility; however, this flexibility incurs high
                 BRAM consumption. In this article, a novel, modular,
                 and BRAM-based switched multiported RAM architecture is
                 proposed. In addition to unidirectional ports with
                 fixed read/write, this switched architecture allows a
                 group of write ports to switch with another group of
                 read ports dynamically, hence altering the number of
                 active ports. The proposed switched-ports architecture
                 is less flexible than a true-multiported RAM where each
                 port is switched individually. Nevertheless, switched
                 memories can dramatically reduce BRAM consumption
                 compared to true ports for systems with alternating
                 port requirements. Previous live-value-table (LVT) and
                 XOR approaches are merged and optimized into a
                 generalized and modular structure that we call an
                 invalidation-based live-value-table (I-LVT). Like a
                 regular LVT, the I-LVT determines the correct bank to
                 read from, but it differs in how updates to the table
                 are made; the LVT approach requires multiple write
                 ports, often leading to an area-intensive
                 register-based implementation, whereas the XOR approach
                 suffers from excessive storage overhead since wider
                 memories are required to accommodate the XOR-ed data.
                 Two specific I-LVT implementations are proposed and
                 evaluated: binary and thermometer coding. The I-LVT
                 approach is especially suitable for deep memories
                 because the table is implemented only in SRAM cells.
                 The I-LVT method gives higher performance while
                 occupying fewer BRAMs than earlier approaches: for
                 several configurations, BRAM usage is reduced by
                 greater than 44\% and clock speed is improved by
                 greater than 76\%. The I-LVT can be used with fixed
                 ports, true ports, or the proposed switched ports
                 architectures. Formal proofs for the suggested methods,
                 resources consumption analysis, usage guidelines, and
                 analytic comparison to other methods are provided. A
                 fully parameterized Verilog implementation is released
                 as an open source library. The library has been
                 extensively tested using Altera's EDA tools.},
}

@article{Stitt:2016:PSW,
  author =       {Greg Stitt and Eric Schwartz and Patrick Cooke},
  title =        {A Parallel Sliding-Window Generator for
                 High-Performance Digital-Signal Processing on {FPGAs}},
  journal =      j-TRETS,
  volume =       {9},
  number =       {3},
  pages =        {23:1--23:??},
  articleno =    {23},
  month =        jul,
  year =         {2016},
  doi =          {https://doi.org/10.1145/2800789},
  issn =         {1936-7406 (print), 1936-7414 (electronic)},
  issn-l =       {1936-7406},
  coden =        {????},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-url =  {https://dl.acm.org/loi/trets},
  bibdate =      {Thu Jul 14 16:35:43 MDT 2016},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract =     {Sliding-window applications, an important class of the
                 digital-signal processing domain, are highly amenable
                 to pipeline parallelism on field-programmable gate
                 arrays (FPGAs). Although memory bandwidth often
                 restricts parallelism for many applications,
                 sliding-window applications can leverage custom
                 buffers, referred to as sliding-window generators, that
                 provide massive input bandwidth that far exceeds the
                 capabilities of external memory. Previous work has
                 introduced a variety of sliding-window generators, but
                 those approaches typically generate at most one window
                 per cycle, which significantly restricts parallelism.
                 In this article, we address this limitation with a
                 parallel sliding-window generator that can generate a
                 configurable number of windows every cycle. Although in
                 practice the number of parallel windows is limited by
                 memory bandwidth, we show that even with common
                 bandwidth limitations, the presented generator enables
                 near-linear speedups up to 16x faster than previous
                 FPGA studies that generate a single window per cycle,
                 which were already in some cases faster than
                 graphics-processing units and microprocessors.},
}

@article{Ul-Abdin:2016:RCF,
  author =       {Zain Ul-Abdin and Bertil Svensson},
  title =        {A Retargetable Compilation Framework for Heterogeneous
                 Reconfigurable Computing},
  journal =      j-TRETS,
  volume =       {9},
  number =       {4},
  pages =        {24:1--24:??},
  articleno =    {24},
  month =        sep,
  year =         {2016},
  doi =          {https://doi.org/10.1145/2843946},
  issn =         {1936-7406 (print), 1936-7414 (electronic)},
  issn-l =       {1936-7406},
  coden =        {????},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-url =  {https://dl.acm.org/loi/trets},
  bibdate =      {Mon Apr 3 11:34:08 MDT 2017},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract =     {The future trend in microprocessors for the more
                 advanced embedded systems is focusing on massively
                 parallel reconfigurable architectures, consisting of
                 heterogeneous ensembles of hundreds of processing
                 elements communicating over a reconfigurable
                 interconnection network. However, the mastering of
                 low-level microarchitectural details involved in the
                 programming of such massively parallel platforms
                 becomes too cumbersome, which limits their adoption in
                 many applications. Thus, there is a dire need for an
                 approach to produce high-performance scalable
                 implementations that harness the computational
                 resources of the emerging reconfigurable platforms.
                 This article addresses the grand challenge of
                 accessibility of these diverse reconfigurable platforms
                 by suggesting the use of a high-level language,
                 occam-pi, and developing a complete design flow for
                 building, compiling, and generating machine code for
                 heterogeneous coarse-grained hardware. We have
                 evaluated the approach by implementing complex
                 industrial case studies and three common signal
                 processing algorithms. The results of the implemented
                 case studies suggest that the occam-pi language-based
                 approach, because of its well-defined semantics for
                 expressing concurrency and reconfigurability,
                 simplifies the development of applications employing
                 runtime reconfigurable devices. The associated compiler
                 framework ensures portability as well as the
                 performance benefits across heterogeneous platforms.},
}

@article{Ziener:2016:FBD,
  author =       {Daniel Ziener and Florian Bauer and Andreas Becher and
                 Christopher Dennl and Klaus Meyer-Wegener and Ute
                 Sch{\"u}rfeld and J{\"u}rgen Teich and J{\"o}rg-Stephan
                 Vogt and Helmut Weber},
  title =        {{FPGA}-Based Dynamically Reconfigurable {SQL} Query
                 Processing},
  journal =      j-TRETS,
  volume =       {9},
  number =       {4},
  pages =        {25:1--25:??},
  articleno =    {25},
  month =        sep,
  year =         {2016},
  doi =          {https://doi.org/10.1145/2845087},
  issn =         {1936-7406 (print), 1936-7414 (electronic)},
  issn-l =       {1936-7406},
  coden =        {????},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-url =  {https://dl.acm.org/loi/trets},
  bibdate =      {Mon Apr 3 11:34:08 MDT 2017},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract =     {In this article, we propose an FPGA-based SQL query
                 processing approach exploiting the capabilities of
                 partial dynamic reconfiguration of modern FPGAs. After
                 the analysis of an incoming query, a query-specific
                 hardware processing unit is generated on the fly and
                 loaded on the FPGA for immediate query execution. For
                 each query, a specialized hardware accelerator pipeline
                 is composed and configured on the FPGA from a set of
                 presynthesized hardware modules. These partially
                 reconfigurable hardware modules are gathered in a
                 library covering all major SQL operations like
                 restrictions and aggregations, as well as more complex
                 operations such as joins and sorts. Moreover, this
                 holistic query processing approach in hardware supports
                 different data processing strategies including row- as
                 column-wise data processing in order to optimize data
                 communication and processing. This article gives an
                 overview of the proposed query processing methodology
                 and the corresponding library of modules. Additionally,
                 a performance analysis is introduced that is able to
                 estimate the processing time of a query for different
                 processing strategies and different communication and
                 processing architecture configurations. With the help
                 of this performance analysis, architectural bottlenecks
                 may be exposed and future optimized architectures,
                 besides the two prototypes presented here, may be
                 determined.},
}

@article{Matthews:2016:SMM,
  author =       {Eric Matthews and Lesley Shannon and Alexandra
                 Fedorova},
  title =        {Shared Memory Multicore {MicroBlaze} System with {SMP}
                 {Linux} Support},
  journal =      j-TRETS,
  volume =       {9},
  number =       {4},
  pages =        {26:1--26:??},
  articleno =    {26},
  month =        sep,
  year =         {2016},
  doi =          {https://doi.org/10.1145/2870638},
  issn =         {1936-7406 (print), 1936-7414 (electronic)},
  issn-l =       {1936-7406},
  coden =        {????},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-url =  {https://dl.acm.org/loi/trets},
  bibdate =      {Mon Apr 3 11:34:08 MDT 2017},
  bibsource =    {http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract =     {In this work, we present PolyBlaze, a scalable and
                 configurable multicore platform for FPGA-based embedded
                 systems and systems research. PolyBlaze is an extension
                 of the MicroBlaze soft processor, leveraging the
                 configurability of the MicroBlaze and bringing it into
                 the multicore era with Linux Symmetric Multi-Processor
                 (SMP) support. This work details the hardware
                 modifications required for the MicroBlaze processor and
                 its software stack to enable fully validated SMP
                 operations, including atomic operation support, shared
                 interrupts and timers, and exception handling. New in
                 this work, we present a scalable and flexible memory
                 hierarchy optimized for Field Programmable Gate Arrays
                 (FPGAs), which manages atomic operations and provides
                 support for future flexible memory hierarchies and
                 heterogeneous systems. Also new is an in-depth analysis
                 of key performance characteristics, including memory
                 bandwidth, latency, and resource usage. For all system
                 configurations, bandwidth is found to scale linearly
                 with the addition of processor cores until the memory
                 interface is saturated. Additionally, average memory
                 latency remains constant until the memory interface is
                 saturated; after which, it scales linearly with each
                 additional processor core.},
}

@Article{Yu:2016:OAH,
  author          = {Ting Yu and Chris Bradley and Oliver Sinnen},
  title           = {{ODoST}: Automatic Hardware Acceleration for
                     Biomedical Model Integration},
  journal         = j-TRETS,
  volume          = {9},
  number          = {4},
  pages           = {27:1--27:??},
  month           = sep,
  year            = {2016},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2870639},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Mon Apr 3 11:34:08 MDT 2017},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Dynamic biomedical systems are mathematically
                     described by Ordinary Differential Equations (ODEs) and
                     their solution is often one of the most computationally
                     intensive parts in biomedical simulations. With high
                     inherent parallelism, hardware acceleration based on
                     Field-Programmable Gate Arrays (FPGAs) has great
                     potential to increase the computational performance of
                     the model simulations, while being very
                     power-efficient. However, the manual hardware
                     implementation is complex and time consuming. The
                     advantages of FPGA designs can only be realised if
                     there is a general solution to automate the process. In
                     this article, we propose a domain-specific high-level
                     synthesis tool called ODoST that automatically
                     generates an FPGA-based Hardware Accelerator Module
                     (HAM) from a high-level description. In this direct
                     approach, ODE equations are directly mapped to
                     processing pipelines without any intermediate
                     architecture layer of processing elements. We evaluate
                     the generated HAMs on real hardware based on their
                     resource usage, processing speed, and power
                     consumption, and compare them with CPUs and a GPU. The
                     results show that FPGA implementations can achieve 15.3
                     times more speedup compared to a single core CPU
                     solution and perform similarly to an auto-generated GPU
                     solution, while the FPGA implementations can achieve
                     14.5 times more power efficiency than the CPU and 3.1
                     times compared to the optimised GPU solution. Improved
                     speedups are foreseeable based on further
                     optimisations.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {27},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Chen:2016:I,
  author          = {Deming Chen},
  title           = {Introduction},
  journal         = j-TRETS,
  volume          = {9},
  number          = {4},
  pages           = {28:1--28:??},
  month           = sep,
  year            = {2016},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2955103},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Mon Apr 3 11:34:08 MDT 2017},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {28},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Wegley:2016:ASD,
  author          = {Evan Wegley and Yanhua Yi and Qinhai Zhang},
  title           = {Application of Specific Delay Window Routing for
                     Timing Optimization in {FPGA} Designs},
  journal         = j-TRETS,
  volume          = {9},
  number          = {4},
  pages           = {29:1--29:??},
  month           = sep,
  year            = {2016},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2892640},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Mon Apr 3 11:34:08 MDT 2017},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {In addition to optimizing for long-path timing and
                     routability, commercial FPGA routing engines must also
                     optimize for various timing constraints, enabling users
                     to fine tune their designs. These timing constraints
                     involve both long- and short-path timing requirements.
                     The intricacies of commercial FPGA architectures add
                     difficulty to the problem of supporting such
                     constraints. In this work, we introduce specific delay
                     window routing as a general method for optimization
                     during the routing stage of the FPGA design flow, which
                     can be applied to various timing constraints
                     constituting both long- and short-path requirements.
                     Furthermore, we propose a key adjustment to standard
                     FPGA routing technology for the purposes of specific
                     delay window routing. By using dual-wave expansion
                     instead of traditional single-wave expansion, we solve
                     the critical issue of inaccurate delay estimation in
                     our wave search, which would otherwise make routing
                     according to a specific delay window difficult. Our
                     results show that this dual-wave method can support
                     stricter timing constraints than the standard
                     single-wave method. For a suite of designs with
                     constraints requiring connections to meet a target
                     delay within 250ps, our dual-wave method could satisfy
                     the requirement for all designs, whereas the
                     single-wave method failed for more than two thirds of
                     the designs.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {29},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Kadric:2016:IPM,
  author =       "Edin Kadric and David Lakata and Andr{\'e} DeHon",
  title =        "Impact of Parallelism and Memory Architecture on
                 {FPGA} Communication Energy",
  journal =      j-TRETS,
  volume =       "9",
  number =       "4",
  pages =        "30:1--30:??",
  month =        sep,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2857057",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Apr 3 11:34:08 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "The energy in FPGA computations is dominated by data
                 communication energy, either in the form of memory
                 references or data movement on interconnect. In this
                 article, we explore how to use data placement and
                 parallelism to reduce communication energy. We show
                 that parallelism can reduce energy and that the optimal
                 level of parallelism increases with the problem size.
                 We further explore how FPGA memory architecture (memory
                 block size(s), memory banking, and spacing between
                 memory banks) can impact communication energy, and
                 determine how to organize the memory architecture to
                 guarantee that the energy overhead compared to the
                 optimally matched architecture for the design is never
                 more than 60\%. We specifically show that an
                 architecture with 32 bit wide, 16Kb internally banked
                 memories placed every 8 columns of 10 4-LUT logic
                 blocks is within 61\% of the optimally matched
                 architecture across the VTR 7 benchmark set and a set
                 of parallelism-tunable benchmarks. Without internal
                 banking, the worst-case overhead is 98\%, achieved with
                 an architecture with 32 bit wide, 8Kb memories placed
                 every 9 columns, roughly comparable to the memory
                 organization on the Cyclone V (where memories are
                 placed about every 10 columns). Monolithic 32 bit wide,
                 16Kb memories placed every 10 columns (comparable to
                 18Kb and 20Kb memories used in Virtex 4 and Stratix V
                 FPGAs) have a 180\% worst-case energy overhead.
                 Furthermore, we show practical cases where designs
                 mapped for optimal parallelism use $ 4.7 \times $ less
                 energy than designs using a single processing
                 element.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "30",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Rodionov:2016:FGI,
  author          = {Alex Rodionov and David Biancolin and Jonathan Rose},
  title           = {Fine-Grained Interconnect Synthesis},
  journal         = j-TRETS,
  volume          = {9},
  number          = {4},
  pages           = {31:1--31:??},
  month           = sep,
  year            = {2016},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2892641},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Mon Apr 3 11:34:08 MDT 2017},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {One of the key challenges for the FPGA industry going
                     forward is to make the task of designing hardware
                     easier. A significant portion of that design task is
                     the creation of the interconnect pathways between
                     functional structures. We present a synthesis tool that
                     automates this process and focuses on the interconnect
                     needs in the fine-grained (sub-IP-block) design space.
                     Here there are several issues that prior research and
                     tools do not address well: the need to have fixed,
                     deterministic latency between communicating units (to
                     enable high-performance local communication without the
                     area overheads of latency insensitivity), and the
                     ability to avoid generating unnecessary arbitration
                     hardware when the application design can avoid it.
                     Using a design example, our tool generates interconnect
                     that requires 69\% fewer lines of specification code
                     than a handwritten Verilog implementation, which is a
                     32\% overall reduction for the entire application. The
                     resulting system, while requiring 6\% more total
                     functional and interconnect area, achieves the same
                     performance. We also show a quantitative and
                     qualitative advantages against an existing commercial
                     interconnect synthesis tool, over which we achieve a
                     25\% performance advantage and 15\%/57\% logic/memory
                     area savings.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {31},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Wulf:2016:FEO,
  author          = {Nicholas Wulf and Alan D. George and Ann Gordon-Ross},
  title           = {A Framework for Evaluating and Optimizing {FPGA}-Based
                     {SoCs} for Aerospace Computing},
  journal         = j-TRETS,
  volume          = {10},
  number          = {1},
  pages           = {1:1--1:??},
  month           = dec,
  year            = {2016},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2888400},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Mon Apr 3 11:34:09 MDT 2017},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {On-board processing systems are often deployed in
                     harsh aerospace environments and must therefore adhere
                     to stringent constraints such as low power, small size,
                     and high dependability in the presence of faults.
                     Field-programmable gate arrays (FPGAs) are often an
                     attractive option for designers seeking low-power,
                     high-performance devices. However, unlike
                     nonreconfigurable devices, radiation effects can alter
                     an FPGA's functionality instead of just the device's
                     data, requiring designers to consider fault-tolerant
                     strategies to mitigate these effects. In this article,
                     we present a framework to ease these system design
                     challenges and aid designers in considering a broad
                     range of devices and fault-tolerant strategies for
                     on-board processing, highlighting the most promising
                     options and tradeoffs early in the design process. This
                     article focuses on the power, dependability, and
                     lifetime evaluation metrics, which our framework
                     calculates and leverages to evaluate the effectiveness
                     of varying system-on-chip (SoC) designs. Finally, we
                     use our framework to evaluate SoC designs for a case
                     study on a hyperspectral-imaging (HSI) mission to
                     demonstrate our framework's ability to identify
                     efficient and effective SoC designs.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {1},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Richardson:2016:AFR,
  author          = {Justin Richardson and Alan George and Kevin Cheng and
                     Herman Lam},
  title           = {Analysis of Fixed, Reconfigurable, and Hybrid Devices
                     with Computational, Memory, {I/O}, \&
                     Realizable-Utilization Metrics},
  journal         = j-TRETS,
  volume          = {10},
  number          = {1},
  pages           = {2:1--2:??},
  month           = dec,
  year            = {2016},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2888401},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Mon Apr 3 11:34:09 MDT 2017},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {The modern processor landscape is a varied and diverse
                     community. As such, developers need a way to quickly
                     and fairly compare various devices for use with
                     particular applications. This article expands the
                     authors' previously published computational-density
                     metrics and presents an analysis of a new generation of
                     various device architectures, including CPU, DSP, FPGA,
                     GPU, and hybrid architectures. Also, new memory metrics
                     are added to expand the existing suite of metrics to
                     characterize the memory resources on various processing
                     devices. Finally, a new relational metric, realizable
                     utilization (RU), is introduced, which quantifies the
                     fraction of the computational density metric that an
                     application achieves within an individual
                     implementation. The RU metric can be used to provide
                     valuable feedback to application developers and
                     architecture designers by highlighting the upper bound
                     on specific application optimization and providing a
                     quantifiable measure of theoretical and realizable
                     performance. Overall, the analysis in this article
                     quantifies the performance tradeoffs among the
                     architectures studied, the memory characteristics of
                     different device types, and the efficiency of device
                     architectures.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {2},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Chao:2016:DTM,
  author          = {Hung-Lin Chao and Sheng-Ya Tung and Pao-Ann Hsiung},
  title           = {Dynamic Task Mapping with Congestion Speculation for
                     Reconfigurable Network-on-Chip},
  journal         = j-TRETS,
  volume          = {10},
  number          = {1},
  pages           = {3:1--3:??},
  month           = dec,
  year            = {2016},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2892633},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Mon Apr 3 11:34:09 MDT 2017},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Network-on-Chip (NoC) has been proposed as a promising
                     communication architecture to replace the dedicated
                     interconnections and shared buses for future embedded
                     system platforms. In such a parallel platform, mapping
                     application tasks to the NoC is a key issue because it
                     affects throughput significantly due to the problem of
                     communication congestion. Increased communication
                     latency, low system performance, and low resource
                     utilization are some side-effects of a bad mapping.
                     Current mapping algorithms either do not consider link
                     utilizations or consider only the current utilizations.
                     Besides, to design an efficient NoC platform, mapping
                     task to computation nodes and scheduling communication
                     should be taken into consideration. In this work, we
                     propose an efficient algorithm for dynamic task mapping
                     with congestion speculation (DTMCS) that not only
                     includes the conventional application mapping, but also
                     further considers future traffic patterns based on the
                     link utilization. The proposed algorithm can reduce
                     overall congestion, instead of only improving the
                     current packet blocking situation. Our experiment
                     results have demonstrated that compared to the
                     state-of-the-art congestion-aware Path Load algorithm,
                     the proposed DTMCS algorithm can reduce up to 40.5\% of
                     average communication latency, while the maximal
                     communication latency can be reduced by up to 67.7\%.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {3},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{LeGal:2016:FSM,
  author          = {Bertrand {Le Gal} and Y{\'e}rom-David Bromberg and
                     Laurent R{\'e}veill{\`e}re and Jigar Solanki},
  title           = {A Flexible {SoC} and Its Methodology for Parser-Based
                     Applications},
  journal         = j-TRETS,
  volume          = {10},
  number          = {1},
  pages           = {4:1--4:??},
  month           = dec,
  year            = {2016},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2939379},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Mon Apr 3 11:34:09 MDT 2017},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Embedded systems are being increasingly network
                     interconnected. They are required to interact with
                     their environment through text-based protocol messages.
                     Parsing such messages is control dominated. The work
                     presented in this article attempts to accelerate
                     message parsers using a codesign-based approach. We
                     propose a generic architecture associated with an
                     automated design methodology that enables SoC/SoPC
                     system generation from high-level specifications of
                     message protocols. Experimental results obtained on a
                     Xilinx ML605 board show acceleration factors ranging
                     from four to 11. Both static and dynamic
                     reconfigurations of coprocessors are discussed and then
                     evaluated so as to reduce the system hardware
                     complexity.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {4},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Pang:2016:MKR,
  author          = {Yeyong Pang and Shaojun Wang and Yu Peng and Xiyuan
                     Peng and Nicholas J. Fraser and Philip H. W. Leong},
  title           = {A Microcoded Kernel Recursive Least Squares Processor
                     Using {FPGA} Technology},
  journal         = j-TRETS,
  volume          = {10},
  number          = {1},
  pages           = {5:1--5:??},
  month           = dec,
  year            = {2016},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2950061},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Mon Apr 3 11:34:09 MDT 2017},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Kernel methods utilize linear methods in a nonlinear
                     feature space and combine the advantages of both.
                     Online kernel methods, such as kernel recursive least
                     squares (KRLS) and kernel normalized least mean squares
                     (KNLMS), perform nonlinear regression in a recursive
                     manner, with similar computational requirements to
                     linear techniques. In this article, an architecture for
                     a microcoded kernel method accelerator is described,
                     and high-performance implementations of sliding-window
                     KRLS, fixed-budget KRLS, and KNLMS are presented. The
                     architecture utilizes pipelining and vectorization for
                     performance, and microcoding for reusability. The
                     design can be scaled to allow tradeoffs between
                     capacity, performance, and area. The design is compared
                     with a central processing unit (CPU), digital signal
                     processor (DSP), and Altera OpenCL implementations. In
                     different configurations on an Altera Arria 10 device,
                     our SW-KRLS implementation delivers floating-point
                     throughput of approximately 16 GFLOPs, latency of 5.5 $
                     \mu $ s, and energy consumption of $ 10^{- 4} $ J,
                     these being improvements over a CPU by factors of 12,
                     17, and 24, respectively.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {5},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Tang:2016:AKM,
  author =       "Qing Y. Tang and Mohammed A. S. Khalid",
  title =        "Acceleration of {$k$}-Means Algorithm Using {Altera SDK}
                 for {OpenCL}",
  journal =      j-TRETS,
  volume =       "10",
  number =       "1",
  pages =        "6:1--6:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2964910",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Apr 3 11:34:09 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "A K-means clustering algorithm involves partitioning
                 of data iteratively into k clusters. It is one of the
                 most popular data-mining algorithms [Wu et al. 2007],
                 and is widely used in other applications, such as image
                 processing and machine learning. However, k-means is
                 highly time-consuming when data or cluster size is
                 large. Traditionally, FPGAs have shown great promise
                 for accelerating computationally intensive algorithms,
                 but they are harder to use for acceleration if we rely
                 on traditional HDL-based design methods. The recent
                 introduction of Altera SDK for the OpenCL high-level
                 synthesis tool allows developers to utilize FPGA's
                 potential without long development periods and
                 extensive hardware knowledge. This article presents an
                 optimized implementation of a k-means clustering
                 algorithm on an FPGA using Altera SDK for OpenCL.
                 Performance and power consumption is measured with
                 various data, cluster, and dimension sizes. When
                 compared to state-of-the-art solutions, this
                 implementation supports larger cluster sizes, offers up
                 to 21x speed over a CPU and is more power efficient
                 than a GPU. Unlike previous implementations, it can
                 deliver consistently high throughput across large or
                 small feature dimensions given reasonable cluster sizes
                 and large enough data size.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Wong:2016:MCM,
  author          = {Henry Wong and Vaughn Betz and Jonathan Rose},
  title           = {Microarchitecture and Circuits for a {200 MHz}
                     Out-of-Order Soft Processor Memory System},
  journal         = j-TRETS,
  volume          = {10},
  number          = {1},
  pages           = {7:1--7:??},
  month           = dec,
  year            = {2016},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2974022},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Mon Apr 3 11:34:09 MDT 2017},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Although FPGAs have grown in capacity, FPGA-based soft
                     processors have grown very little because of the
                     difficulty of achieving higher performance in exchange
                     for area. Superscalar out-of-order processors promise
                     large performance gains, and the memory subsystem is a
                     key part of such a processor that must help supply
                     increased performance. In this article, we describe and
                     explore microarchitectural and circuit-level tradeoffs
                     in the design of such a memory system. We show the
                     significant instructions-per-cycle wins for providing
                     various levels of out-of-order memory access and memory
                     dependence speculation ($ 1.32 \times $ SPECint2000)
                     and for the addition of a second-level cache (another $
                     1.60 \times $ ). With careful microarchitecture and
                     circuit design, we also achieve a L1 translation
                     lookaside buffers and cache lookup with 29\% less logic
                     delay than the simpler Nios II/f memory system.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {7},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Rouhani:2016:ART,
  author          = {Bita Darvish Rouhani and Azalia Mirhoseini and Ebrahim
                     M. Songhori and Farinaz Koushanfar},
  title           = {Automated Real-Time Analysis of Streaming Big and
                     Dense Data on Reconfigurable Platforms},
  journal         = j-TRETS,
  volume          = {10},
  number          = {1},
  pages           = {8:1--8:??},
  month           = dec,
  year            = {2016},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/2974023},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Mon Apr 3 11:34:09 MDT 2017},
  bibsource       = {http://portal.acm.org/;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {We propose SSketch, a novel automated framework for
                     efficient analysis of dynamic big data with dense
                     (non-sparse) correlation matrices on reconfigurable
                     platforms. SSketch targets streaming applications where
                     each data sample can be processed only once and storage
                     is severely limited. Our framework adaptively learns
                     from the stream of input data and updates a
                     corresponding ensemble of lower-dimensional data
                     structures, a.k.a., a sketch matrix. A new sketching
                     methodology is introduced that tailors the problem of
                     transforming the big data with dense correlations to an
                     ensemble of lower-dimensional subspaces such that it is
                     suitable for hardware-based acceleration performed by
                     reconfigurable hardware. The new method is scalable,
                     while it significantly reduces costly memory
                     interactions and enhances matrix computation
                     performance by leveraging coarse-grained parallelism
                     existing in the dataset. SSketch provides an automated
                     optimization methodology for creating the most accurate
                     data sketch for a given set of user-defined
                     constraints, including runtime and power as well as
                     platform constraints such as memory. To facilitate
                     automation, SSketch takes advantage of a
                     Hardware/Software (HW/SW) co-design approach: It
                     provides an Application Programming Interface that can
                     be customized for rapid prototyping of an arbitrary
                     matrix-based data analysis algorithm. Proof-of-concept
                     evaluations on a variety of visual datasets with more
                     than 11 million non-zeros demonstrate up to a 200-fold
                     speedup on our hardware-accelerated realization of
                     SSketch compared to a software-based deployment on a
                     general-purpose processor.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {8},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Bourge:2016:GEC,
  author =       "Alban Bourge and Olivier Muller and Fr{\'e}d{\'e}ric
                 Rousseau",
  title =        "Generating Efficient Context-Switch Capable Circuits
                 through Autonomous Design Flow",
  journal =      j-TRETS,
  volume =       "10",
  number =       "1",
  pages =        "9:1--9:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2996199",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Apr 3 11:34:09 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Commercial off-the-shelf (COTS) Field-Programmable
                 Gate Arrays (FPGAs) are becoming increasingly powerful.
                 In addition to their huge hardware resources, they are
                 also integrated into complete systems on chips (SOCs),
                 e.g., in the latest Xilinx Zynq or Altera Stratix
                 platforms. However, cooperation between FPGAs and their
                 surroundings, and the flexibility of hardware task
                 management could still be improved. For instance,
                 mechanisms have yet to be automated to allow multi-user
                 approaches. A reconfigurable resource can be shared
                 between applications or users only if it has a
                 context-switch ability allowing applications to be
                 paused and resumed in response to system demands. Here,
                 we present a high-level synthesis (HLS) design flow
                 producing a context-switch-capable circuit. The design
                 flow manipulates the intermediate representation of an
                 HLS tool to build the context extraction mechanism and
                 to optimize performance for the circuit produced. The
                 method is based on efficient checkpoint selection and
                 insertion of a powerful scan-chain into the initial
                 circuit. This scan-chain can extract flip-flops or
                 memory content. Experiments with the system produced
                 show that it has a low hardware overhead for many
                 benchmark applications, and that the hardware added has
                 a negligible impact on application performance.
                 Comparisons with current standard methods highlight the
                 efficiency of our contributions.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Cardoso:2017:ISS,
  author =       "Jo{\~a}o M. P. Cardoso and Cristina Silvano",
  title =        "Introduction to the Special Section on {FPL 2015}",
  journal =      j-TRETS,
  volume =       "10",
  number =       "2",
  pages =        "10:1--10:??",
  month =        apr,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3041224",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Dec 23 10:23:01 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Kim:2017:SSC,
  author =       "Jin Hee Kim and Jason H. Anderson",
  title =        "Synthesizable Standard Cell {FPGA} Fabrics Targetable
                 by the {Verilog}-to-Routing {CAD} Flow",
  journal =      j-TRETS,
  volume =       "10",
  number =       "2",
  pages =        "11:1--11:??",
  month =        apr,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3024063",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Dec 23 10:23:01 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "In this article, we consider implementing
                 field-programmable gate arrays (FPGAs) using a standard
                 cell design methodology and present a framework for the
                 automated generation of synthesizable FPGA fabrics. The
                 open-source Verilog-to-Routing (VTR) FPGA architecture
                 evaluation framework [Rose et al. 2012] is extended to
                 generate synthesizable Verilog for its in-memory FPGA
                 architectural device model. The Verilog can
                 subsequently be synthesized into standard cells, placed
                 and routed using an ASIC design flow. A second
                 extension to VTR generates a configuration bitstream
                 for the FPGA, where the bitstream configures the FPGA
                 to realize a user-provided placed and routed design.
                 The proposed framework and methodology makes possible
                 the silicon implementation of a wide range of
                 VTR-modeled FPGA fabrics. In an experimental study,
                 area and timing-optimized FPGA implementations in 65 nm
                 TSMC standard cells are compared to a 65 nm Altera
                 commercial FPGA. In addition, we consider augmenting
                 the generic standard-cell library from TSMC with a
                 manually designed and laid-out FPGA-specific cell. We
                 demonstrate the utility of the custom cell in reducing
                 the area of the synthesized FPGA fabric.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Burovskiy:2017:EAH,
  author =       "Pavel Burovskiy and Paul Grigoras and Spencer Sherwin
                 and Wayne Luk",
  title =        "Efficient Assembly for High-Order Unstructured {FEM}
                 Meshes {(FPL 2015)}",
  journal =      j-TRETS,
  volume =       "10",
  number =       "2",
  pages =        "12:1--12:??",
  month =        apr,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3024064",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Dec 23 10:23:01 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "The Finite Element Method (FEM) is a common numerical
                 technique used for solving Partial Differential
                 Equations on large and unstructured domain geometries.
                 Numerical methods for FEM typically use algorithms and
                 data structures which exhibit an unstructured memory
                 access pattern. This makes acceleration of FEM on
                 Field-Programmable Gate Arrays using an efficient,
                 deeply pipelined architecture particularly challenging.
                 In this work, we focus on implementing and optimising a
                 vector assembly operation which, in the context of FEM,
                 induces the unstructured memory access. We propose a
                 dataflow architecture, graph-based theoretical model,
                 and design flow for optimising the assembly operation
                 for spectral/hp finite element method on reconfigurable
                 accelerators. We evaluate the proposed approach on two
                 benchmark meshes and show that the graph-theoretic
                 method of generating a static data access schedule
                 results in a significant improvement in resource
                 utilisation compared to prior work. This enables
                 supporting larger FEM meshes on FPGA than previously
                 possible.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Yang:2017:FSA,
  author =       "Hsin-Jung Yang and Kermin Fleming and Felix
                 Winterstein and Michael Adler and Joel Emer",
  title =        "{(FPL 2015) Scavenger}: Automating the Construction of
                 Application-Optimized Memory Hierarchies",
  journal =      j-TRETS,
  volume =       "10",
  number =       "2",
  pages =        "13:1--13:??",
  month =        apr,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3009971",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Dec 23 10:23:01 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "High-level abstractions separate algorithm design from
                 platform implementation, allowing programmers to focus
                 on algorithms while building complex systems. This
                 separation also provides system programmers and
                 compilers an opportunity to optimize platform services
                 on an application-by-application basis. In
                 field-programmable gate arrays (FPGAs), platform-level
                 malleability extends to the memory system: Unlike
                 general-purpose processors, in which memory hardware is
                 fixed at design time, the capacity, associativity, and
                 topology of FPGA memory systems may all be tuned to
                 improve application performance. Since application
                 kernels may only explicitly use few memory resources,
                 substantial memory capacity may be available to the
                 platform for use on behalf of the user program. In this
                 work, we present Scavenger, which utilizes spare
                 resources to construct program-optimized memories, and
                 we also perform an initial exploration of methods for
                 automating the construction of these
                 application-specific memory hierarchies. Although
                 exploiting spare resources can be beneficial,
                 na{\"\i}vely consuming all memory resources may cause
                 frequency degradation. To relieve timing pressure in
                 large block RAM (BRAM) structures, we provide
                 microarchitectural techniques to trade memory latency
                 for design frequency. We demonstrate, by examining a
                 set of benchmarks, that our scalable cache
                 microarchitecture achieves performance gains of 7\% to
                 74\% (with a 26\% geometric mean on average) over the
                 baseline cache microarchitecture when scaling the size
                 of first-level caches to the maximum.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Kapre:2017:HDR,
  author =       "Nachiket Kapre and Jan Gray",
  title =        "{Hoplite}: a Deflection-Routed Directional Torus {NoC}
                 for {FPGAs}",
  journal =      j-TRETS,
  volume =       "10",
  number =       "2",
  pages =        "14:1--14:??",
  month =        apr,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3027486",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Dec 23 10:23:01 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "We can design an FPGA-optimized lightweight
                 network-on-chip (NoC) router for flit-oriented
                 packet-switched communication that is an order of
                 magnitude smaller (in terms of LUTs and FFs) than
                 state-of-the-art FPGA overlay routers available today.
                 We present Hoplite, an efficient, lightweight, and fast
                 FPGA overlay NoC that is designed to be small and
                 compact by (1) using deflection routing instead of
                 buffered switching to eliminate expensive FIFO buffers
                 and (2) using a torus topology to reduce the cost of
                 switch crossbar. Buffering and crossbar implementation
                 complexities have traditionally limited speeds and
                 imposed heavy resource costs in conventional FPGA
                 overlay NoCs. We take care to exploit the fracturable
                 lookup tables (LUT) organization of the FPGA to further
                 improve the resource efficiency of mapping the
                 expensive crossbar multiplexers. Hoplite can outperform
                 classic, bidirectional, buffered mesh networks for
                 single-flit-oriented FPGA applications by as much as $
                 1.5 \times $ (best achievable throughputs for a $ 10
                 \times 10 $ system) or $ 2.5 \times $ (allocating same
                 amount of FPGA resources to both NoCs) for uniform
                 random traffic. When compared to buffered mesh
                 switches, FPGA-based deflection routers are $ \approx
                 3.5 \times $ smaller (HLS-generated switch) and $ 2.5
                 \times $ faster (clock period) for 32b payloads. In a
                 separate experiment, we hand-crafted an RTL version of
                 our switch with location constraints that requires only
                 60 LUTs and 100 FFs per router and runs at 2.9 ns. We
                 conduct additional layout experiments on modern Xilinx
                 and Altera FPGAs and demonstrate wide-channel
                 chip-spanning layouts that run in excess of 300 MHz
                 while consuming 10--15\% of overall chip resources. We
                 also demonstrate a clustered RISC-V multiprocessor
                 organization that uses Hoplite to help deliver the high
                 processing throughputs of the FPGA architecture to user
                 applications.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Leong:2017:FYF,
  author =       "Philip H. W. Leong and Hideharu Amano and Jason
                 Anderson and Koen Bertels and Jo{\~a}o M. P. Cardoso
                 and Oliver Diessel and Guy Gogniat and Mike Hutton and
                 Junkyu Lee and Wayne Luk and Patrick Lysaght and Marco
                 Platzner and Viktor K. Prasanna and Tero Rissa and
                 Cristina Silvano and Hayden Kwok-Hay So and Yu Wang",
  title =        "The First 25 Years of the {FPL} Conference:
                 Significant Papers",
  journal =      j-TRETS,
  volume =       "10",
  number =       "2",
  pages =        "15:1--15:??",
  month =        apr,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2996468",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Dec 23 10:23:01 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "A summary of contributions made by significant papers
                 from the first 25 years of the Field-Programmable Logic
                 and Applications conference (FPL) is presented. The 27
                 papers chosen represent those which have most strongly
                 influenced theory and practice in the field.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Takano:2017:PSA,
  author =       "Shigeyuki Takano",
  title =        "Performance Scalability of Adaptive Processor
                 Architecture",
  journal =      j-TRETS,
  volume =       "10",
  number =       "2",
  pages =        "16:1--16:??",
  month =        apr,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3007902",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Dec 23 10:23:01 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "In this article, we evaluate the performance
                 scalability of architectures called adaptive
                 processors, which dynamically configure an
                 application-specific pipelined datapath and perform a
                 data-flow streaming execution. Previous works have
                 examined the basics of the following: (1) a
                 computational model that supports the swap-in/out of a
                 partial datapath---namely, a virtual hardware is realized
                 by hardware, without a host processor and its software;
                 (2) an architecture that has shown a minimum pipeline
                 requirement and a minimum component requirement; and
                 (3) the characteristics of the execution phase and a
                 stack shift that realizes the swap-in/out. However,
                 these works did not explore the design space,
                 particularly with respect to the following: (1) the
                 clock cycle time on the adaptive processor, which must
                 depend on a wire delay that is primarily used for the
                 global communication of requests, acknowledgments,
                 acquirements, releases, and so forth, and (2) a revised
                 control system that can handle the out-of-order
                 acknowledgment and in-order acquirement that guarantee
                 the correct datapath configuration with a conditional
                 branch for the configurations. This article explores
                 the scaling of the ALU resources versus pipelining of
                 the wires.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Liu:2017:TOF,
  author =       "Zhiqiang Liu and Yong Dou and Jingfei Jiang and Jinwei
                 Xu and Shijie Li and Yongmei Zhou and Yingnan Xu",
  title =        "Throughput-Optimized {FPGA} Accelerator for Deep
                 Convolutional Neural Networks",
  journal =      j-TRETS,
  volume =       "10",
  number =       "3",
  pages =        "17:1--17:??",
  month =        jul,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3079758",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Dec 23 10:23:02 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Deep convolutional neural networks (CNNs) have gained
                 great success in various computer vision applications.
                 State-of-the-art CNN models for large-scale
                 applications are computation intensive and memory
                 expensive and, hence, are mainly processed on
                 high-performance processors like server CPUs and GPUs.
                 However, there is an increasing demand of high-accuracy
                 or real-time object detection tasks in large-scale
                 clusters or embedded systems, which requires
                 energy-efficient accelerators because of the green
                 computation requirement or the limited battery
                 restriction. Due to the advantages of energy efficiency
                 and reconfigurability, Field-Programmable Gate Arrays
                 (FPGAs) have been widely explored as CNN accelerators.
                 In this article, we present an in-depth analysis of
                 computation complexity and the memory footprint of each
                 CNN layer type. Then a scalable parallel framework is
                 proposed that exploits four levels of parallelism in
                 hardware acceleration. We further put forward a
                 systematic design space exploration methodology to
                 search for the optimal solution that maximizes
                 accelerator throughput under the FPGA constraints such
                 as on-chip memory, computational resources, external
                 memory bandwidth, and clock frequency. Finally, we
                 demonstrate the methodology by optimizing three
                 representative CNNs (LeNet, AlexNet, and VGG-S) on a
                 Xilinx VC709 board. The average performance of the
                 three accelerators is 424.7, 445.6, and 473.4 GOP/s
                 under 100 MHz working frequency, which outperforms the
                 CPU and previous work significantly.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "17",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Ueno:2017:BCF,
  author =       "Tomohiro Ueno and Kentaro Sano and Satoru Yamamoto",
  title =        "Bandwidth Compression of Floating-Point Numerical Data
                 Streams for {FPGA}-Based High-Performance Computing",
  journal =      j-TRETS,
  volume =       "10",
  number =       "3",
  pages =        "18:1--18:??",
  month =        jul,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3053688",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Dec 23 10:23:02 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Although computational performance is often limited by
                 insufficient bandwidth to/from an external memory, it
                 is not easy to physically increase off-chip memory
                 bandwidth. In this study, we propose a hardware-based
                 bandwidth compression technique that can be applied to
                 field-programmable gate array (FPGA)-based
                 high-performance computation with a logically wider
                 effective memory bandwidth. Our proposed hardware
                 approach can boost the performance of FPGA-based stream
                 computations by applying a data compression technique
                 to effectively transfer more data streams. To apply
                 this data compression technique to bandwidth
                 compression via hardware, several requirements must
                 first be satisfied, including an acceptable level of
                 compression performance and a sufficiently small
                 hardware footprint. Our proposed hardware-based
                 bandwidth compressor utilizes an efficient
                 prediction-based data compression algorithm. Moreover,
                 we propose a multichannel serializer and deserializer
                 that enable applications to use multiple channels of
                 computational data with the bandwidth compression. The
                 serializer encodes compressed data blocks of multiple
                 channels into a data stream, which is efficiently
                 written to an external memory. Based on preliminary
                 evaluation, we define an encoding format considering
                 both high compression ratio and small hardware area. As
                 a result, we demonstrate that our area saving bandwidth
                 compressor increases performance of an FPGA-based fluid
                 dynamics simulation by deploying more processing
                 elements to exploit spatial parallelism with the
                 enhanced memory bandwidth.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "18",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Laforest:2017:MCM,
  author =       "Charles Eric Laforest and Jason H. Anderson",
  title =        "Microarchitectural Comparison of the {MXP} and
                 {Octavo} Soft-Processor {FPGA} Overlays",
  journal =      j-TRETS,
  volume =       "10",
  number =       "3",
  pages =        "19:1--19:??",
  month =        jul,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3053679",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Dec 23 10:23:02 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Field-Programmable Gate Arrays (FPGAs) can yield
                 higher performance and lower power than software
                 solutions on CPUs or GPUs. However, designing with
                 FPGAs requires specialized hardware design skills and
                 hours-long CAD processing times. To reduce and
                 accelerate the design effort, we can implement an
                 overlay architecture on the FPGA, on which we then more
                 easily construct the desired system but at a large cost
                 in performance and area relative to a direct FPGA
                 implementation. In this work, we compare the
                 micro-architecture, performance, and area of two
                 soft-processor overlays: the Octavo multi-threaded
                 soft-processor and the MXP soft vector processor. To
                 measure the area and performance penalties of these
                 overlays relative to the underlying FPGA hardware, we
                 compare direct FPGA implementations of the
                 micro-benchmarks written in C synthesized with the
                 LegUp HLS tool and also written in the Verilog HDL.
                 Overall, Octavo's higher operating frequency and MXP's
                 more efficient code execution result in similar
                 performance from both, within an order of magnitude of
                 direct FPGA implementations, but with a penalty of an
                 order of magnitude greater area.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Gu:2017:IRF,
  author =       "Chongyan Gu and Neil Hanley and M{\'a}ire O'Neill",
  title =        "Improved Reliability of {FPGA}-Based {PUF}
                 Identification Generator Design",
  journal =      j-TRETS,
  volume =       "10",
  number =       "3",
  pages =        "20:1--20:??",
  month =        jul,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3053681",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Dec 23 10:23:02 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Physical unclonable functions (PUFs), a form of
                 physical security primitive, enable digital identifiers
                 to be extracted from devices, such as field
                 programmable gate arrays (FPGAs). Many PUF
                 implementations have been proposed to generate these
                 unique $n$-bit binary strings. However, they often offer
                 insufficient uniqueness and reliability when
                 implemented on FPGAs and can consume excessive
                 resources. To address these problems, in this article
                 we present an efficient, lightweight, and scalable PUF
                 identification (ID) generator circuit that offers a
                 compact design with good uniqueness and reliability
                 properties and is specifically designed for FPGAs. A
                 novel post-characterisation methodology is also
                 proposed that improves the reliability of a PUF without
                 the need for any additional hardware resources.
                 Moreover, the proposed post-characterisation method can
                 be generally used for any FPGA-based PUF designs. The
                 PUF ID generator consumes 8.95\% of the hardware
                 resources of a low-cost Xilinx Spartan-6 LX9 FPGA and
                 0.81\% of a Xilinx Artix-7 FPGA. Experimental results
                 show good uniqueness, reliability, and uniformity with
                 no occurrence of bit-aliasing. In particular, the
                 reliability of the PUF is close to 100\% over an
                 environmental temperature range of 25${}^\circ $C to
                 70${}^\circ $C with $ \pm 10 $\% variation in the supply
                 voltage.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "20",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Prost-Boucle:2017:EVF,
  author =       "Adrien Prost-Boucle and Fr{\'e}d{\'e}ric P{\'e}trot
                 and Vincent Leroy and Hande Alemdar",
  title =        "Efficient and Versatile {FPGA} Acceleration of Support
                 Counting for Stream Mining of Sequences and Frequent
                 Itemsets",
  journal =      j-TRETS,
  volume =       "10",
  number =       "3",
  pages =        "21:1--21:??",
  month =        jul,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3027485",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Dec 23 10:23:02 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Stream processing has become extremely popular for
                 analyzing huge volumes of data for a variety of
                 applications, including IoT, social networks, retail,
                 and software logs analysis. Streams of data are
                 produced continuously and are mined to extract patterns
                 characterizing the data. A class of data mining
                 algorithm, called generate-and-test, produces a set of
                 candidate patterns that are then evaluated over data.
                 The main challenges of these algorithms are to achieve
                 high throughput, low latency, and reduced power
                 consumption. In this article, we present a novel
                 power-efficient, fast, and versatile hardware
                 architecture whose objective is to monitor a set of
                 target patterns to maintain their frequency over a
                 stream of data. This accelerator can be used to
                 accelerate data-mining algorithms, including itemsets
                 and sequences mining. The massive fine-grain
                 reconfiguration capability of field-programmable gate
                 array (FPGA) technologies is ideal to implement the
                 high number of pattern-detection units needed for these
                 intensive data-mining applications. We have thus
                 designed and implemented an IP that features
                 high-density FPGA occupation and high working
                 frequency. We provide detailed description of the IP
                 internal micro-architecture and its actual
                 implementation and optimization for the targeted FPGA
                 resources. We validate our architecture by developing a
                 co-designed implementation of the Apriori Frequent
                 Itemset Mining (FIM) algorithm, and perform numerous
                 experiments against existing hardware and software
                 solutions. We demonstrate that FIM hardware
                 acceleration is particularly efficient for large and
                 low-density datasets (i.e., long-tailed datasets). Our
                 IP reaches a data throughput of 250 million items/s and
                 monitors up to 11.6k patterns simultaneously, on a
                 prototyping board that overall consumes 24W in the
                 worst case. Furthermore, our hardware accelerator
                 remains generic and can be integrated to other generate
                 and test algorithms.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "21",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Tili:2017:RPG,
  author =       "Ilian Tili and Kalin Ovtcharov and J. Gregory
                 Steffan",
  title =        "Reducing the Performance Gap between Soft Scalar
                 {CPUs} and Custom Hardware with {TILT}",
  journal =      j-TRETS,
  volume =       "10",
  number =       "3",
  pages =        "22:1--22:??",
  month =        jul,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3079757",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Dec 23 10:23:02 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "By using resource sharing field-programmable gate
                 array (FPGA) compute engines, we can reduce the
                 performance gap between soft scalar CPUs and
                 resource-intensive custom datapath designs. This
                 article demonstrates that Thread- and Instruction-Level
                 parallel Template architecture (TILT), a programmable
                 FPGA-based horizontally microcoded compute engine
                 designed to highly utilize floating point (FP)
                 functional units (FUs), can improve significantly the
                 average throughput of eight FP-intensive applications
                 compared to a soft scalar CPU (similar to a FP-extended
                 Nios). For eight benchmark applications, we show that:
                 (i) a base TILT configuration having a single instance
                 for each FU type can improve the performance over a
                 soft scalar CPU by 15.8 $ \times $, while requiring on
                 average 26\% of the custom datapaths' area; (ii)
                 selectively increasing the number of FUs can more than
                 double TILT's average throughput, reducing the
                 custom-datapath-throughput-gap from 576 $ \times $ to
                 14 $ \times $; and (iii) replicated instances of the
                 most computationally dense TILT configuration that fit
                 within the area of each custom datapath design can
                 reduce the gap to 8.27 $ \times $, while replicated
                 instances of application-tuned configurations of TILT
                 can reduce the custom-datapath-throughput-gap to an
                 average of 5.22 $ \times $, and up to 3.41 $ \times $
                 for the Matrix Multiply benchmark. Last, we present
                 methods for design space reduction, and we correctly
                 predict the computationally densest design for seven
                 out of eight benchmarks.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "22",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Wulf:2017:OFP,
  author =       "Nicholas Wulf and Alan D. George and Ann Gordon-Ross",
  title =        "Optimizing {FPGA} Performance, Power, and
                 Dependability with Linear Programming",
  journal =      j-TRETS,
  volume =       "10",
  number =       "3",
  pages =        "23:1--23:??",
  month =        jul,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3079756",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Dec 23 10:23:02 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Field-programmable gate arrays (FPGA) are an
                 increasingly attractive alternative to traditional
                 microprocessor-based computing architectures in
                 extreme-computing domains, such as aerospace and
                 supercomputing. FPGAs offer several resource types that
                 offer different tradeoffs between speed, power, and
                 area, which make FPGAs highly flexible for varying
                 application computational requirements. However, since
                 an application's computational operations can map to
                 different resource types, a major challenge in
                 leveraging resource-diverse FPGAs is determining the
                 optimal distribution of these operations across the
                 device's available resources for varying FPGA devices,
                 resulting in an extremely large design space. In order
                 to facilitate fast design-space exploration, this
                 article presents a method based on linear programming
                 (LP) that determines the optimal operation distribution
                 for a particular device and application with respect to
                 performance, power, or dependability metrics. Our LP
                 method is an effective tool for exploring early designs
                 by quickly analyzing thousands of FPGAs to determine
                 the best FPGA devices and operation distributions,
                 which significantly reduces design time. We demonstrate
                 our LP method's effectiveness with two case studies
                 involving dot-product and distance-calculation kernels
                 on a range of Virtex-5 FPGAs. Results show that our LP
                 method selects optimal distributions of operations to
                 within an average of 4\% of actual values.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "23",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Riebler:2017:EBB,
  author =       "Heinrich Riebler and Michael Lass and Robert
                 Mittendorf and Thomas L{\"o}cke and Christian Plessl",
  title =        "Efficient Branch and Bound on {FPGAs} Using Work
                 Stealing and Instance-Specific Designs",
  journal =      j-TRETS,
  volume =       "10",
  number =       "3",
  pages =        "24:1--24:??",
  month =        jul,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3053687",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Dec 23 10:23:02 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Branch and bound (B\&B) algorithms structure the search
                 space as a tree and eliminate infeasible solutions
                 early by pruning subtrees that cannot lead to a valid
                 or optimal solution. Custom hardware designs
                 significantly accelerate the execution of these
                 algorithms. In this article, we demonstrate a
                 high-performance B\&B implementation on FPGAs. First, we
                 identify general elements of B\&B algorithms and
                 describe their implementation as a finite state
                 machine. Then, we introduce workers that autonomously
                 cooperate using work stealing to allow parallel
                 execution and full utilization of the target FPGA.
                 Finally, we explore advantages of instance-specific
                 designs that target a specific problem instance to
                 improve performance. We evaluate our concepts by
                 applying them to a branch and bound problem, the
                 reconstruction of corrupted AES keys obtained from
                 cold-boot attacks. The evaluation shows that our work
                 stealing approach is scalable with the available
                 resources and provides speedups proportional to the
                 number of workers. Instance-specific designs allow us
                 to achieve an overall speedup of 47 $ \times $ compared
                 to the fastest implementation of AES key reconstruction
                 so far. Finally, we demonstrate how instance-specific
                 designs can be generated just-in-time such that the
                 provided speedups outweigh the additional time required
                 for design synthesis.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "24",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Gerlein:2017:NCA,
  author =       "Eduardo A. Gerlein and T. M. McGinnity and Ammar
                 Belatreche and Sonya Coleman",
  title =        "Network on Chip Architecture for Multi-Agent Systems
                 in {FPGA}",
  journal =      j-TRETS,
  volume =       "10",
  number =       "4",
  pages =        "25:1--25:??",
  month =        dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3121112",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Dec 29 07:28:53 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "A system of interacting agents is, by definition, very
                 demanding in terms of computational resources. Although
                 multi-agent systems have been used to solve complex
                 problems in many areas, it is usually very difficult to
                 perform large-scale simulations in their targeted
                 serial computing platforms. Reconfigurable hardware, in
                 particular Field Programmable Gate Arrays devices, have
                 been successfully used in High Performance Computing
                 applications due to their inherent flexibility, data
                 parallelism, and algorithm acceleration capabilities.
                 Indeed, reconfigurable hardware seems to be the next
                 logical step in the agency paradigm, but only a few
                 attempts have been successful in implementing
                 multi-agent systems in these platforms. This article
                 discusses the problem of inter-agent communications in
                 Field Programmable Gate Arrays. It proposes a
                 Network-on-Chip in a hierarchical star topology to
                 enable agents' transactions through message
                 broadcasting using the Open Core Protocol as an
                 interface between hardware modules. A customizable
                 router microarchitecture is described and a multi-agent
                 system is created to simulate and analyse message
                 exchanges in a generic heavy traffic load agent-based
                 application. Experiments have shown a throughput of
                 1.6Gbps per port at 100MHz without packet loss and
                 seamless scalability characteristics.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "25",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Fraser:2017:FIK,
  author =       "Nicholas J. Fraser and Junkyu Lee and Duncan J. M.
                 Moss and Julian Faraone and Stephen Tridgell and Craig
                 T. Jin and Philip H. W. Leong",
  title =        "{FPGA} Implementations of Kernel Normalised Least Mean
                 Squares Processors",
  journal =      j-TRETS,
  volume =       "10",
  number =       "4",
  pages =        "26:1--26:??",
  month =        dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3106744",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Dec 29 07:28:53 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Kernel adaptive filters (KAFs) are online machine
                 learning algorithms which are amenable to highly
                 efficient streaming implementations. They require only
                 a single pass through the data and can act as universal
                 approximators, i.e. approximate any continuous function
                 with arbitrary accuracy. KAFs are members of a family
                 of kernel methods which apply an implicit non-linear
                 mapping of input data to a high dimensional feature
                 space, permitting learning algorithms to be expressed
                 entirely as inner products. Such an approach avoids
                 explicit projection into the feature space, enabling
                 computational efficiency. In this paper, we propose the
                 first fully pipelined implementation of the kernel
                 normalised least mean squares algorithm for regression.
                 Independent training tasks necessary for hyperparameter
                 optimisation fill pipeline stages, so no stall cycles
                 to resolve dependencies are required. Together with
                 other optimisations to reduce resource utilisation and
                 latency, our core achieves 161 GFLOPS on a Virtex 7
                 XC7VX485T FPGA for a floating point implementation and
                 211 GOPS for fixed point. Our PCI Express based
                 floating-point system implementation achieves 80\% of
                 the core's speed, this being a speedup of 10$ \times $
                 over an optimised implementation on a desktop processor
                 and 2.66$ \times $ over a GPU.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "26",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Chu:2017:FCA,
  author =       "Thiem Van Chu and Shimpei Sato and Kenji Kise",
  title =        "Fast and Cycle-Accurate Emulation of Large-Scale
                 Networks-on-Chip Using a Single {FPGA}",
  journal =      j-TRETS,
  volume =       "10",
  number =       "4",
  pages =        "27:1--27:??",
  month =        dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3151758",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Dec 29 07:28:53 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Modeling and simulation/emulation play a major role in
                 research and development of novel Networks-on-Chip
                 (NoCs). However, conventional software simulators are
                 so slow that studying NoCs for emerging many-core
                 systems with hundreds to thousands of cores is
                 challenging. State-of-the-art FPGA-based NoC emulators
                 have shown great potential in speeding up the NoC
                 simulation, but they cannot emulate large-scale NoCs
                 due to the FPGA capacity constraints. Moreover,
                 emulating large-scale NoCs under synthetic workloads on
                 FPGAs typically requires a large amount of memory and
                 thus involves the use of off-chip memory, which makes
                 the overall design much more complicated and may
                 substantially degrade the emulation speed. This article
                 presents methods for fast and cycle-accurate emulation
                 of NoCs with up to thousands of nodes using a single
                 FPGA. We first describe how to emulate a NoC under a
                 synthetic workload using only FPGA on-chip memory
                 (BRAMs). We next present a novel use of time-division
                 multiplexing where BRAMs are effectively used for
                 emulating a network using a small number of nodes,
                 thereby overcoming the FPGA capacity constraints. We
                 propose methods for emulating both direct and indirect
                 networks, focusing on the commonly used meshes and
                 fat-trees ($k$-ary $n$-trees). This is different from
                 prior work that considers only direct networks. Using
                 the proposed methods, we build a NoC emulator, called
                 FNoC, and demonstrate the emulation of some mesh-based
                 and fat-tree-based NoCs with canonical router
                 architectures. Our evaluation results show that (1) the
                 size of the largest NoC that can be emulated depends on
                 only the FPGA on-chip memory capacity; (2) a mesh-based
                 NoC with 16,384 nodes (128$ \times $128 NoC) and a
                 fat-tree-based NoC with 6,144 switch nodes and 4,096
                 terminal nodes (4-ary 6-tree NoC) can be emulated using
                 a single Virtex-7 FPGA; and (3) when emulating these
                 two NoCs, we achieve, respectively, 5,047$ \times $ and
                 232$ \times $ speedups over BookSim, one of the most
                 widely used software-based NoC simulators, while
                 maintaining the same level of accuracy.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "27",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Yoshimi:2017:PPJ,
  author =       "Masato Yoshimi and Yasin Oge and Tsutomu Yoshinaga",
  title =        "Pipelined Parallel Join and Its {FPGA}-Based
                 Acceleration",
  journal =      j-TRETS,
  volume =       "10",
  number =       "4",
  pages =        "28:1--28:??",
  month =        dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3079759",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Dec 29 07:28:53 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "A huge amount of data is being generated and
                 accumulated in data centers, which leads to an
                 important increase in the required energy consumption
                 to analyze these data. Thus, we must consider the
                 redesign of current computer systems architectures to
                 be more friendly to applications based on distributed
                 algorithms that require a high data transfer rate.
                 Novel computer architectures that introduce dedicated
                 accelerators to enable near-data processing have been
                 discussed and developed for high-speed big-data
                 analysis. In this work, we propose a computer system
                 with an FPGA-based accelerator, namely,
                 interconnected-FPGAs, which offers two advantages: (1)
                 direct data transmission and (2) offloading computation
                 into data-flow in the FPGA. In this article, we
                 demonstrate the capability of the proposed
                 interconnected-FPGAs system to accelerate join
                 operations in a relational database. We developed a new
                 parallel join algorithm, PPJoin, targeted to big-data
                 analysis in a shared-nothing architecture. PPJoin is an
                 extended version of the NUMA-based parallel join
                 algorithm, created by overlapping computation by
                 multicore processors and data communication. The data
                 communication between computational nodes can be
                 accelerated by direct data transmission without passing
                 through the main memory of the hosts. To confirm the
                 performance of the PPJoin algorithm and its
                 acceleration process using an interconnected-FPGA
                 platform, we evaluated a simple query for large tables.
                 Additionally, to support availability, we also
                 evaluated the actual benchmark query. Our evaluation
                 results confirm that the PPJoin algorithm is faster
                 than a software-based query engine by 1.5--5 times.
                 Moreover, we experimentally confirmed that the direct
                 data transmission by interconnected FPGAs reduces
                 computational time around 20\% for PPJoin.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "28",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Fabry:2017:ERA,
  author =       "Pieter Fabry and David Thomas",
  title =        "Efficient Reconfigurable Architecture for Pricing
                 Exotic Options",
  journal =      j-TRETS,
  volume =       "10",
  number =       "4",
  pages =        "29:1--29:??",
  month =        dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3158228",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Dec 29 07:28:53 MST 2017",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "This article presents a new method for Monte Carlo
                 (MC) option pricing using field-programmable gate
                 arrays (FPGAs), which use a discrete-space random walk
                 over a binomial lattice, rather than the continuous
                 space-walks used by existing approaches. The underlying
                 hypothesis is that the discrete-space walk will
                 significantly reduce the area needed for each MC
                 engine, and the resulting increase in parallelisation
                 and raw performance outweighs any accuracy losses
                 introduced by the discretisation. Experimental results
                 support this hypothesis, showing that for a given MC
                 simulation size, there is no significant loss in
                 accuracy by using a discrete space model for the
                 path-dependent exotic financial options. Analysis of
                 the binomial simulation model shows that only
                 limited-precision fixed-point arithmetic is needed, and
                 also shows that pairs of MC kernels are able to share
                 RAM resources. When using realistic constraints on
                 pricing problems, it was found that the size of a
                 discrete-space MC engine can be kept to 370 Flip-Flops
                 and 233 Lookup Tables, allowing up to 3,000
                 variance-reduced MC cores in one FPGA. The combination
                 of a highly parallelisable architecture and
                 model-specific optimisations means that the binomial
                 pricing technique allows for a 50$ \times $ improvement
                 in throughput compared to existing FPGA approaches,
                 without any reduction in accuracy.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "29",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Bakos:2018:ISS,
  author =       "Jason D. Bakos",
  title =        "Introduction to the Special Section on {FCCM'16}",
  journal =      j-TRETS,
  volume =       "11",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3183572",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:42:59 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "1e",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Wong:2018:HPI,
  author =       "Henry Wong and Vaughn Betz and Jonathan Rose",
  title =        "High-Performance Instruction Scheduling Circuits for
                 Superscalar Out-of-Order Soft Processors",
  journal =      j-TRETS,
  volume =       "11",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3093741",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:42:59 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Soft processors have a role to play in simplifying
                 field-programmable gate array (FPGA) application design
                 as they can be deployed only when needed, and it is
                 easier to write and debug single-threaded software code
                 than create hardware. The breadth of this second role
                 increases when the performance of the soft processor
                 increases, yet the sophisticated out-of-order
                 superscalar approaches that arrived in the mid-1990s
                 are not employed, despite their area cost now being
                 easily tolerable. In this article, we take an important
                 step toward out-of-order execution in soft processors
                 by exploring instruction scheduling in an FPGA
                 substrate. This differs from the hard-processor design
                 problem because the logic substrate is restricted to
                 LUTs, whereas hard processor scheduling circuits employ
                 CAM and wired-OR structures to great benefit. We
                 discuss both circuit and microarchitectural trade-offs
                 and compare three circuit structures for the scheduler,
                 including a new structure called a fused-logic matrix
                 scheduler. Using our optimized circuits, we show that
                 four-issue distributed schedulers with up to 54 entries
                 can be built with the same cycle time as the commercial
                 Nios II/f soft processor (240MHz). This careful design
                 has the potential to significantly increase both the
                 IPC and raw compute performance of a soft processor,
                 compared to current commercial soft processors.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Davis:2018:KHA,
  author =       "James J. Davis and Eddie Hung and Joshua M. Levine and
                 Edward A. Stott and Peter Y. K. Cheung and George A.
                 Constantinides",
  title =        "{KAPow}: High-Accuracy, Low-Overhead Online Per-Module
                 Power Estimation for {FPGA} Designs",
  journal =      j-TRETS,
  volume =       "11",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3129789",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:42:59 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "In an FPGA system-on-chip design, it is often
                 insufficient to merely assess the power consumption of
                 the entire circuit by compile-time estimation or
                 runtime power measurement. Instead, to make better
                 decisions, one must understand the power consumed by
                 each module in the system. In this work, we combine
                 measurements of register-level switching activity and
                 system-level power to build an adaptive online model
                 that produces live breakdowns of power consumption
                 within the design. Online model refinement avoids
                 time-consuming characterization while also allowing the
                 model to track long-term operating condition changes.
                 Central to our method is an automated flow that selects
                 signals predicted to be indicative of high power
                 consumption, instrumenting them for monitoring. We
                 named this technique KAPow, for `K'ounting Activity for
                 Power estimation, which we show to be accurate and to
                 have low overheads across a range of representative
                 benchmarks. We also propose a strategy allowing for the
                 identification and subsequent elimination of counters
                 found to be of low significance at runtime, reducing
                 algorithmic complexity without sacrificing significant
                 accuracy. Finally, we demonstrate an application
                 example in which a module-level power breakdown can be
                 used to determine an efficient mapping of tasks to
                 modules and reduce system-wide power consumption by up
                 to 7\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Giesen:2018:COS,
  author =       "Hans Giesen and Benjamin Gojman and Raphael Rubin and
                 Ji Kim and Andr{\'e} DeHon",
  title =        "Continuous Online Self-Monitoring Introspection
                 Circuitry for Timing Repair by Incremental
                 Partial-Reconfiguration {(COSMIC TRIP)}",
  journal =      j-TRETS,
  volume =       "11",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3158229",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:42:59 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "We show that continuously monitoring on-chip delays at
                 the LUT-to-LUT link level during operation allows a
                 field-programmable gate array to detect and self-adapt
                 to aging and environmental timing effects. Using a
                 lightweight ($ < 4 \% $ added area) mechanism for
                 monitoring transition timing, a Difference Detector
                 with First-Fail Latch, we can estimate the timing
                 margin on circuits and identify the individual links
                 that have degraded and whose delay is determining the
                 worst-case circuit delay. Combined with
                 Choose-Your-own-Adventure precomputed, fine-grained
                 repair alternatives, we introduce a strategy for rapid,
                 in-system incremental repair of links with degraded
                 timing. We show that these techniques allow us to
                 respond to a single aging event in less than 190ms for
                 the toronto20 benchmarks. The result is a step toward
                 systems where adaptive reconfiguration on the
                 time-scale of seconds is viable and beneficial.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Zhao:2018:FGM,
  author =       "Zhuoran Zhao and Nguyen T. H. Nguyen and Dimitris
                 Agiakatsikas and Ganghee Lee and Ediz Cetin and Oliver
                 Diessel",
  title =        "Fine-Grained Module-Based Error Recovery in
                 {FPGA}-Based {TMR} Systems",
  journal =      j-TRETS,
  volume =       "11",
  number =       "1",
  pages =        "4:1--4:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3173549",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:42:59 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Space processing applications deployed on SRAM-based
                 Field Programmable Gate Arrays (FPGAs) are vulnerable
                 to radiation-induced Single Event Upsets (SEUs).
                 Compared with the well-known SEU mitigation
                 solution---Triple Modular Redundancy (TMR) with
                 configuration memory scrubbing---TMR with module-based
                 error recovery (MER) is notably more energy efficient
                 and responsive in repairing soft-errors in the system.
                 Unfortunately, TMR-MER systems also need to resort to
                 scrubbing when errors occur between sub-components,
                 such as in interconnection nets, which are not
                 recovered by MER. This article addresses this problem
                 by proposing a fine-grained module-based error recovery
                 technique, which can localize and correct errors that
                 classic MER fails to do without additional system
                 hardware. We evaluate our proposal via fault-injection
                 campaigns on three types of circuits implemented in
                 Xilinx 7-Series devices. With respect to scrubbing, we
                 observed reductions in the mean time to repair
                 configuration memory errors of between 48.5\% and
                 89.4\%, while reductions in energy used recovering from
                 configuration memory errors were estimated at between
                 77.4\% and 96.1\%. These improvements result in higher
                 reliability for systems employing TMR with fine-grained
                 reconfiguration than equivalent systems relying on
                 scrubbing for configuration error recovery.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{AlKadi:2018:GPC,
  author =       "Muhammed {Al Kadi} and Benedikt Janssen and Jones Yudi
                 and Michael Huebner",
  title =        "General-Purpose Computing with Soft {GPUs} on
                 {FPGAs}",
  journal =      j-TRETS,
  volume =       "11",
  number =       "1",
  pages =        "5:1--5:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3173548",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:42:59 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/python.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Using field-programmable gate arrays (FPGAs) as a
                 substrate to deploy soft graphics processing units
                 (GPUs) would enable offering the FPGA compute power in
                 a very flexible GPU-like tool flow.
                 Application-specific adaptations like selective
                 hardening of floating-point operations and instruction
                 set subsetting would mitigate the high area and power
                 demands of soft GPUs. This work explores the
                 capabilities and limitations of soft General Purpose
                 Computing on GPUs (GPGPU) for both fixed- and
                 floating-point arithmetic. For this purpose, we have
                 developed FGPU: a configurable, scalable, and portable
                 GPU architecture designed especially for FPGAs. FGPU is
                 open-source and implemented entirely in RTL. It can be
                 programmed in OpenCL and controlled through a Python
                 API. This article introduces its hardware architecture
                 as well as its tool flow. We evaluated the proposed
                 GPGPU approach against multiple other solutions. In
                 comparison to homogeneous Multi-Processor
                 System-On-Chips (MPSoCs), we found that using a soft
                 GPU is a Pareto-optimal solution regarding throughput
                 per area and energy consumption. On average, FGPU has a
                 2.9$ \times $ better compute density and 11.2$ \times $
                 less energy consumption than a single MicroBlaze
                 processor when computing in IEEE-754 floating-point
                 format. An average speedup of about 4$ \times $ over
                 the ARM Cortex-A9 supported with the NEON vector
                 co-processor has been measured for fixed- or
                 floating-point benchmarks. In addition, the biggest
                 FGPU cores we could implement on a Xilinx Zynq-7000
                 System-On-Chip (SoC) can deliver similar performance to
                 equivalent implementations with High-Level Synthesis
                 (HLS).",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Tatsumura:2018:EFM,
  author =       "Kosuke Tatsumura and Sadegh Yazdanshenas and Vaughn
                 Betz",
  title =        "Enhancing {FPGAs} with Magnetic Tunnel Junction-Based
                 Block {RAMs}",
  journal =      j-TRETS,
  volume =       "11",
  number =       "1",
  pages =        "6:1--6:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3154425",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:42:59 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "While plentiful on-chip memory is necessary for many
                 designs to fully utilize an FPGA's computational
                 capacity, SRAM scaling is becoming more difficult
                 because of increasing device variation. An alternative
                 is to build FPGA block RAM (BRAM) from magnetic tunnel
                 junctions (MTJ), as this emerging embedded memory has a
                 small cell size, low energy usage, and good
                 scalability. We conduct a detailed comparison study of
                 SRAM and MTJ BRAMs that includes cell designs that are
                 robust with device variation, transistor-level design
                 and optimization of all the required BRAM-specific
                 circuits, and variation-aware simulation at the 22nm
                 node. At a 256Kb block size, MTJ-BRAM is 3.06$ \times $
                 denser and 55\% more energy efficient and its F$_{max}$
                 is 274MHz, which is adequate for most FPGA system clock
                 domains. We also detail further enhancements that allow
                 these 256 Kb MTJ BRAMs to operate at a higher speed of
                 353MHz for the streaming FIFOs, which are very common
                 in FPGA designs and describe how the non-volatility of
                 MTJ BRAM enables novel on-chip configuration and
                 power-down modes. For a RAM architecture similar to the
                 latest commercial FPGAs, MTJ-BRAMs could expand FPGA
                 memory capacity by 2.95$ \times $ with no die size
                 increase.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Stewart:2018:RPI,
  author =       "Robert Stewart and Kirsty Duncan and Greg Michaelson
                 and Paulo Garcia and Deepayan Bhowmik and Andrew
                 Wallace",
  title =        "{RIPL}: a Parallel Image Processing Language for
                 {FPGAs}",
  journal =      j-TRETS,
  volume =       "11",
  number =       "1",
  pages =        "7:1--7:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3180481",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:42:59 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Specialized FPGA implementations can deliver higher
                 performance and greater power efficiency than embedded
                 CPU or GPU implementations for real-time image
                 processing. Programming challenges limit their wider
                 use, because the implementation of FPGA architectures
                 at the register transfer level is time consuming and
                 error prone. Existing software languages supported by
                 high-level synthesis (HLS), although providing a
                 productivity improvement, are too general purpose to
                 generate efficient hardware without the use of
                 hardware-specific code optimizations. Such
                 optimizations leak hardware details into the
                 abstractions that software languages are there to
                 provide, and they require knowledge of FPGAs to
                 generate efficient hardware, such as by using language
                 pragmas to partition data structures across memory
                 blocks. This article presents a thorough account of the
                 Rathlin image processing language (RIPL), a high-level
                 image processing domain-specific language for FPGAs. We
                 motivate its design, based on higher-order algorithmic
                 skeletons, with requirements from the image processing
                 domain. RIPL's skeletons suffice to elegantly describe
                 image processing stencils, as well as recursive
                 algorithms with nonlocal random access patterns. At its
                 core, RIPL employs a dataflow intermediate
                 representation. We give a formal account of the
                 compilation scheme from RIPL skeletons to static and
                 cyclostatic dataflow models to describe their data
                 rates and static scheduling on FPGAs. RIPL compares
                 favorably to the Vivado HLS OpenCV library and C++
                 compiled with Vivado HLS. RIPL achieves between 54 and
                 191 frames per second (FPS) at 100MHz for four
                 synthetic benchmarks, faster than HLS OpenCV in three
                 cases. Two real-world algorithms are implemented in
                 RIPL: visual saliency and mean shift segmentation. For
                 the visual saliency algorithm, RIPL achieves 71 FPS
                 compared to optimized C++ at 28 FPS. RIPL is also
                 concise, being 5x shorter than C++ and 111x shorter
                 than an equivalent direct dataflow implementation. For
                 mean shift segmentation, RIPL achieves 7 FPS compared
                 to optimized C++ on 64 CPU cores at 1.1, and RIPL is
                 10x shorter than the direct dataflow FPGA
                 implementation.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Khan:2018:EAM,
  author =       "Farheen Fatima Khan and Andy Ye",
  title =        "An Evaluation on the Accuracy of the Minimum-Width
                 Transistor Area Models in Ranking the Layout Area of
                 {FPGA} Architectures",
  journal =      j-TRETS,
  volume =       "11",
  number =       "1",
  pages =        "8:1--8:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3182394",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:42:59 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "This work provides an evaluation on the accuracy of
                 the minimum-width transistor area models in ranking the
                 actual layout area of FPGA architectures. Both the
                 original VPR area model and the new COFFE area model
                 are compared against the actual layouts with up to
                 three metal layers for the various FPGA building
                 blocks. We found that both models have significant
                 variations with respect to the accuracy of their
                 predictions across the building blocks. In particular,
                 the original VPR model overestimates the layout area of
                 larger buffers, full adders, and multiplexers by as
                 much as 38\%, while they underestimate the layout area
                 of smaller buffers and multiplexers by as much as 58\%,
                 for an overall prediction error variation of 96\%. The
                 newer COFFE model also significantly overestimates the
                 layout area of full adders by 13\% and underestimates
                 the layout area of multiplexers by a maximum of 60\%
                 for a prediction error variation of 73\%. Such
                 variations are particularly significant considering
                 sensitivity analyses are not routinely performed in
                 FPGA architectural studies. Our results suggest that
                 such analyses are extremely important in studies that
                 employ the minimum-width area models so the tolerance
                 of the architectural conclusions against the prediction
                 error variations can be quantified. Furthermore, an
                 open-source version of the layouts of the actual FPGA
                 building blocks should be created so their actual
                 layout area can be used to achieve a highly accurate
                 ranking of the implementation area of FPGA
                 architectures built upon these layouts.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Wijesundera:2018:FRP,
  author =       "Deshya Wijesundera and Alok Prakash and Thambipillai
                 Srikanthan and Achintha Ihalage",
  title =        "Framework for Rapid Performance Estimation of Embedded
                 Soft Core Processors",
  journal =      j-TRETS,
  volume =       "11",
  number =       "2",
  pages =        "9:1--9:??",
  month =        nov,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3195801",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:00 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "The large number of embedded soft core processors
                 available today make it tedious and time consuming to
                 select the best processor for a given application. This
                 task is even more challenging due to the numerous
                 configuration options available for a single soft core
                 processor while optimizing for contradicting design
                 requirements such as performance and area. In this
                 article, we propose a generic framework for rapid
                 performance estimation of applications on soft core
                 processors. The proposed technique is scalable to the
                 large number of configuration options available in
                 modern soft core processors by relying on rapid and
                 accurate estimation models instead of time-consuming
                 FPGA synthesis and execution-based techniques.
                 Experimental results on two leading commercial soft
                 core processors executing applications from the widely
                 used CHStone benchmark suite show an average error of
                 less than 6\% while running in the order of minutes
                 when compared to hours taken by synthesis-based
                 techniques.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Rossi:2018:PPR,
  author =       "Enrico Rossi and Marvin Damschen and Lars Bauer and
                 Giorgio Buttazzo and J{\"o}rg Henkel",
  title =        "Preemption of the Partial Reconfiguration Process to
                 Enable Real-Time Computing With {FPGAs}",
  journal =      j-TRETS,
  volume =       "11",
  number =       "2",
  pages =        "10:1--10:??",
  month =        nov,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3182183",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:00 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "To improve computing performance in real-time
                 applications, modern embedded platforms comprise
                 hardware accelerators that speed up the task's most
                 compute-intensive parts. A recent trend in the design
                 of real-time embedded systems is to integrate
                 field-programmable gate arrays (FPGA) that are
                 reconfigured with different accelerators at runtime, to
                 cope with dynamic workloads that are subject to timing
                 constraints. One of the major limitations when dealing
                 with partial FPGA reconfiguration in real-time systems
                 is that the reconfiguration port can only perform one
                 reconfiguration at a time: if a high-priority task
                 issues a reconfiguration request while the
                 reconfiguration port is already occupied by a
                 lower-priority task, the high-priority task has to wait
                 until the current reconfiguration is completed (a
                 phenomenon known as priority inversion), unless the
                 current reconfiguration is aborted (introducing
                 unbounded delays in low-priority tasks, a phenomenon
                 known as starvation). This article shows how priority
                 inversion and starvation can be solved by making the
                 reconfiguration process preemptive---that is, allowing
                 it to be interrupted at any time and resumed at a later
                 time without restarting it from scratch. Such a feature
                 is crucial for the design of runtime reconfigurable
                 real-time systems but not yet available in today's
                 platforms. Furthermore, the trade-off of achieving a
                 guaranteed bound on the reconfiguration delay for
                 low-priority tasks and the maximum delay induced for
                 high-priority tasks when preempting an ongoing
                 reconfiguration has been identified and analyzed.
                 Experimental results on the Xilinx Zynq-7000 platform
                 show that the proposed implementation of preemptive
                 reconfiguration introduces a low runtime overhead, thus
                 effectively solving priority inversion and
                 starvation.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Petelin:2018:WEF,
  author =       "Oleg Petelin and Vaughn Betz",
  title =        "{Wotan}: Evaluating {FPGA} Architecture Routability
                 without Benchmarks",
  journal =      j-TRETS,
  volume =       "11",
  number =       "2",
  pages =        "11:1--11:??",
  month =        nov,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3195800",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:00 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "FPGA routing architectures consist of routing wires
                 and programmable switches that together account for the
                 majority of the fabric delay and area, making
                 evaluation and optimization of an FPGA's routing
                 architecture very important. Routing architectures have
                 traditionally been evaluated using a full synthesize,
                 pack, place and route CAD flow over a suite of
                 benchmark circuits. While the results are accurate, a
                 full CAD flow has a long runtime and is often tuned to
                 a specific FPGA architecture type, which limits
                 exploration of different architecture options early in
                 the design process. In this article, we present Wotan,
                 a tool to quickly estimate routability for a wide range
                 of architectures without the use of benchmark circuits.
                 At its core, our routability predictor efficiently
                 counts paths through the FPGA routing graph to (1)
                 estimate the probability of node congestion and (2)
                 estimate the probabilities to successfully route a
                 randomized subset of (source, sink) pairs, which are
                 then combined into an overall routability metric. We
                 describe our predictor and present routability
                 estimates for a range of 6-LUT and 4-LUT architectures
                 using mixes of wire types connected in complex ways,
                 showing a rank correlation of 0.91 with routability
                 results from the full VPR CAD flow while requiring 18$
                 \times $ less CPU effort.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Anandakumar:2018:RHA,
  author =       "N. Nalla Anandakumar and M. Prem Laxman Das and
                 Somitra K. Sanadhya and Mohammad S. Hashmi",
  title =        "Reconfigurable Hardware Architecture for Authenticated
                 Key Agreement Protocol Over Binary {Edwards} Curve",
  journal =      j-TRETS,
  volume =       "11",
  number =       "2",
  pages =        "12:1--12:??",
  month =        nov,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3231743",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:00 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "In this article, we present a high-performance
                 hardware architecture for Elliptic curve based
                 (authenticated) key agreement protocol ``Elliptic Curve
                 Menezes, Qu and Vanstone'' (ECMQV) over Binary Edwards
                 Curve (BEC). We begin by analyzing inversion module on
                 a 251-bit binary field. Subsequently, we present Field
                 Programmable Gate Array (FPGA) implementations of the
                 unified formula for computing elliptic curve point
                 addition on BEC in affine and projective coordinates
                 and investigate the relative performance of these two
                 coordinates. Then, we implement the $w$-coordinate based
                 differential addition formulae suitable for usage in
                 Montgomery ladder. Next, we present a novel hardware
                 architecture of BEC point multiplication using mixed
                 $w$-coordinates of the Montgomery laddering algorithm and
                 analyze it in terms of resistance to Simple Power
                 Analysis (SPA) attack. In order to improve the
                 performance, the architecture utilizes registers
                 efficiently and uses efficient scheduling mechanisms
                 for the BEC arithmetic implementations. Our
                 implementation results show that the proposed
                 architecture is resistant against SPA attack and yields
                 a better performance when compared to the existing
                 state-of-the-art BEC designs for computing point
                 multiplication (PM). Finally, we present an FPGA design
                 of ECMQV key agreement protocol using BEC defined over
                 GF($ 2^{251} $). The execution of ECMQV protocol takes
                 66.47 $ \mu $s using 32,479 slices on Virtex-4 FPGA
                 and 52.34 $ \mu $s using 15,988 slices on Virtex-5
                 FPGA. To the best of our knowledge, this is the first
                 FPGA design of the ECMQV protocol using BEC.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Daigneault:2018:ASS,
  author = {Marc-Andre Daigneault and Jean Pierre David},
  title = {Automated Synthesis of Streaming Transfer Level Hardware
    Designs},
  journal = j-TRETS,
  volume = {11},
  number = {2},
  pages = {13:1--13:??},
  month = nov,
  year = {2018},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3243930},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Sat Oct 19 17:43:00 MDT 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {As modern field-programmable gate arrays (FPGA) enable high
    computing performance and efficiency, their programming with low-level
    hardware description languages is time-consuming and remains a major
    obstacle to their adoption. High-level synthesis compilers are able to
    produce register-transfer-level (RTL) designs from C/C++ algorithmic
    descriptions, but despite allowing significant design-time
    improvements, these tools are not always able to generate hardware
    designs that compare to handmade RTL designs. In this article, we
    consider synthesis from an intermediate-level (IL) language that allows
    the description of algorithmic state machines handling connections
    between streaming sources and sinks. However, the interconnection of
    streaming sources and sinks can lead to cyclic combinational relations,
    resulting in undesirable behaviors or un-synthesizable designs. We
    propose a functional-level methodology to automate the resolution of
    such cyclic relations into acyclic combinational functions. The
    proposed IL synthesis methodology has been applied to the design of
    pipelined floating-point cores. The results obtained show how the
    proposed IL methodology can simplify the description of pipelined
    architectures while enabling performances that are close to those
    achievable through an RTL design methodology.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {13},
  fjournal = {ACM Transactions on Reconfigurable Technology and Systems
    (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}

@Article{Chen:2018:ISS,
  author = {Deming Chen and Andrew Putnam and Steve Wilton},
  title = {Introduction to the Special Section on Deep Learning in
    {FPGAs}},
  journal = j-TRETS,
  volume = {11},
  number = {3},
  pages = {14:1--14:??},
  month = dec,
  year = {2018},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3294768},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Sat Oct 19 17:43:00 MDT 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {14},
  fjournal = {ACM Transactions on Reconfigurable Technology and Systems
    (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}

@Article{Prost-Boucle:2018:HEC,
  author =       "Adrien Prost-Boucle and Alban Bourge and
                 Fr{\'e}d{\'e}ric P{\'e}trot",
  title =        "High-Efficiency Convolutional Ternary Neural Networks
                 with Custom Adder Trees and Weight Compression",
  journal =      j-TRETS,
  volume =       "11",
  number =       "3",
  pages =        "15:1--15:??",
  month =        dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3270764",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:00 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Although performing inference with artificial neural
                 networks (ANN) was until quite recently considered as
                 essentially compute intensive, the emergence of deep
                 neural networks coupled with the evolution of the
                 integration technology transformed inference into a
                 memory bound problem. This ascertainment being
                 established, many works have lately focused on
                 minimizing memory accesses, either by enforcing and
                 exploiting sparsity on weights or by using few bits for
                 representing activations and weights, to be able to use
                 ANNs inference in embedded devices. In this work, we
                 detail an architecture dedicated to inference using
                 ternary $ \{ -1, 0, 1 \} $ weights and activations. This
                 architecture is configurable at design time to provide
                 throughput vs. power trade-offs to choose from. It is
                 also generic in the sense that it uses information
                 drawn for the target technologies (memory geometries
                 and cost, number of available cuts, etc.) to adapt at
                 best to the FPGA resources. This allows to achieve up
                 to 5.2k frames per second per Watt for classification
                 on a VC709 board using approximately half of the
                 resources of the FPGA.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Blott:2018:FRE,
  author =       "Michaela Blott and Thomas B. Preu{\ss}er and Nicholas
                 J. Fraser and Giulio Gambardella and Kenneth O'Brien
                 and Yaman Umuroglu and Miriam Leeser and Kees Vissers",
  title =        "{FINN-R}: an End-to-End Deep-Learning Framework for
                 Fast Exploration of Quantized Neural Networks",
  journal =      j-TRETS,
  volume =       "11",
  number =       "3",
  pages =        "16:1--16:??",
  month =        dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3242897",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:00 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Convolutional Neural Networks have rapidly become the
                 most successful machine-learning algorithm, enabling
                 ubiquitous machine vision and intelligent decisions on
                 even embedded computing systems. While the underlying
                 arithmetic is structurally simple, compute and memory
                 requirements are challenging. One of the promising
                 opportunities is leveraging reduced-precision
                 representations for inputs, activations, and model
                 parameters. The resulting scalability in performance,
                 power efficiency, and storage footprint provides
                 interesting design compromises in exchange for a small
                 reduction in accuracy. FPGAs are ideal for exploiting
                 low-precision inference engines leveraging custom
                 precisions to achieve the required numerical accuracy
                 for a given application. In this article, we describe
                 the second generation of the FINN framework, an
                 end-to-end tool that enables design-space exploration
                 and automates the creation of fully customized
                 inference engines on FPGAs. Given a neural network
                 description, the tool optimizes for given platforms,
                 design targets, and a specific precision. We introduce
                 formalizations of resource cost functions and
                 performance predictions and elaborate on the
                 optimization algorithms. Finally, we evaluate a
                 selection of reduced precision neural networks ranging
                 from CIFAR-10 classifiers to YOLO-based object
                 detection on a range of platforms including PYNQ and
                 AWS F1, demonstrating new unprecedented measured
                 throughput at 50 TOp/s on AWS F1 and 5 TOp/s on
                 embedded devices.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Ding:2018:LLH,
  author = {Ruizhou Ding and Zeye Liu and R. D. (Shawn) Blanton and Diana
    Marculescu},
  title = {Lightening the Load with Highly Accurate Storage- and
    Energy-Efficient {LightNNs}},
  journal = j-TRETS,
  volume = {11},
  number = {3},
  pages = {17:1--17:??},
  month = dec,
  year = {2018},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3270689},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Sat Oct 19 17:43:00 MDT 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {Hardware implementations of deep neural networks (DNNs) have
    been adopted in many systems because of their higher classification
    speed. However, while they may be characterized by better accuracy,
    larger DNNs require significant energy and area, thereby limiting their
    wide adoption. The energy consumption of DNNs is driven by both memory
    accesses and computation. Binarized neural networks (BNNs), as a
    tradeoff between accuracy and energy consumption, can achieve great
    energy reduction and have good accuracy for large DNNs due to their
    regularization effect. However, BNNs show poor accuracy when a smaller
    DNN configuration is adopted. In this article, we propose a new DNN
    architecture, LightNN, which replaces the multiplications to one shift
    or a constrained number of shifts and adds. Our theoretical analysis
    for LightNNs shows that their accuracy is maintained while dramatically
    reducing storage and energy requirements. For a fixed DNN
    configuration, LightNNs have better accuracy at a slight energy
    increase than BNNs, yet are more energy efficient with only slightly
    less accuracy than conventional DNNs. Therefore, LightNNs provide more
    options for hardware designers to trade off accuracy and energy.
    Moreover, for large DNN configurations, LightNNs have a regularization
    effect, making them better in accuracy than conventional DNNs. These
    conclusions are verified by experiment using the MNIST and CIFAR-10
    datasets for different DNN configurations. Our FPGA implementation for
    conventional DNNs and LightNNs confirms all theoretical and simulation
    results and shows that LightNNs reduce latency and use fewer FPGA
    resources compared to conventional DNN architectures.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {17},
  fjournal = {ACM Transactions on Reconfigurable Technology and Systems
    (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}

@Article{Meloni:2018:NEC,
  author =       "Paolo Meloni and Alessandro Capotondi and Gianfranco
                 Deriu and Michele Brian and Francesco Conti and Davide
                 Rossi and Luigi Raffo and Luca Benini",
  title =        "{NEURAghe}: Exploiting {CPU--FPGA} Synergies for
                 Efficient and Flexible {CNN} Inference Acceleration on
                 {Zynq SoCs}",
  journal =      j-TRETS,
  volume =       "11",
  number =       "3",
  pages =        "18:1--18:??",
  month =        dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3284357",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:00 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Deep convolutional neural networks (CNNs) obtain
                 outstanding results in tasks that require human-level
                 understanding of data, like image or speech
                 recognition. However, their computational load is
                 significant, motivating the development of
                 CNN-specialized accelerators. This work presents
                 NEURAghe, a flexible and efficient hardware/software
                 solution for the acceleration of CNNs on Zynq SoCs.
                 NEURAghe leverages the synergistic usage of Zynq ARM
                 cores and of a powerful and flexible
                 Convolution-Specific Processor deployed on the
                 reconfigurable logic. The Convolution-Specific
                 Processor embeds both a convolution engine and a
                 programmable soft core, releasing the ARM processors
                 from most of the supervision duties and allowing the
                 accelerator to be controlled by software at an
                 ultra-fine granularity. This methodology opens the way
                 for cooperative heterogeneous computing: While the
                 accelerator takes care of the bulk of the CNN workload,
                 the ARM cores can seamlessly execute hard-to-accelerate
                 parts of the computational graph, taking advantage of
                 the NEON vector engines to further speed up
                 computation. Through the companion NeuDNN SW stack,
                 NEURAghe supports end-to-end CNN-based classification
                 with a peak performance of 169GOps/s, and an energy
                 efficiency of 17GOps/W. Thanks to our heterogeneous
                 computing model, our platform improves upon the
                 state-of-the-art, achieving a frame rate of 5.5 frames
                 per second (fps) on the end-to-end execution of VGG-16
                 and 6.6fps on ResNet-18.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "18",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Liu:2018:OCB,
  author =       "Shuanglong Liu and Hongxiang Fan and Xinyu Niu and
                 Ho-cheung Ng and Yang Chu and Wayne Luk",
  title =        "Optimizing {CNN}-based Segmentation with Deeply
                 Customized Convolutional and Deconvolutional
                 Architectures on {FPGA}",
  journal =      j-TRETS,
  volume =       "11",
  number =       "3",
  pages =        "19:1--19:??",
  month =        dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3242900",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:00 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Convolutional Neural Networks (CNNs)-based
                 algorithms have been successful in solving image
                 recognition problems, showing very large accuracy
                 improvement. In recent years, deconvolution layers are
                 widely used as key components in the state-of-the-art
                 CNNs for end-to-end training and models to support
                 tasks such as image segmentation and super resolution.
                 However, the deconvolution algorithms are
                 computationally intensive, which limits their
                 applicability to real-time applications. Particularly,
                 there has been little research on the efficient
                 implementations of deconvolution algorithms on FPGA
                 platforms that have been widely used to accelerate CNN
                 algorithms by practitioners and researchers due to
                 their high performance and power efficiency. In this
                 work, we propose and develop deconvolution architecture
                 for efficient FPGA implementation. FPGA-based
                 accelerators are proposed for both deconvolution and
                 CNN algorithms. Besides, memory sharing between the
                 computation modules is proposed for the FPGA-based CNN
                 accelerator as well as for other optimization
                 techniques. A non-linear optimization model based on
                 the performance model is introduced to efficiently
                 explore the design space to achieve optimal processing
                 speed of the system and improve power efficiency.
                 Furthermore, a hardware mapping framework is developed
                 to automatically generate the low-latency hardware
                 design for any given CNN model on the target device.
                 Finally, we implement our designs on Xilinx Zynq ZC706
                 board and the deconvolution accelerator achieves a
                 performance of 90.1 giga operations per second (GOPS)
                 under 200MHz working frequency and a performance
                 density of 0.10 GOPS/DSP using 32-bit quantization,
                 which significantly outperforms previous designs on
                 FPGAs. A real-time application of scene segmentation on
                 Cityscapes Dataset is used to evaluate our CNN
                 accelerator on Zynq ZC706 board, and the system
                 achieves a performance of 107 GOPS and 0.12 GOPS/DSP
                 using 16-bit quantization and supports up to 17 frames
                 per second for 512 $ \times $ 512 image inputs with a
                 power consumption of only 9.6W.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Boutros:2018:YCI,
  author = {Andrew Boutros and Sadegh Yazdanshenas and Vaughn Betz},
  title = {You Cannot Improve What You Do not Measure: {FPGA} vs. {ASIC}
    Efficiency Gaps for Convolutional Neural Network Inference},
  journal = j-TRETS,
  volume = {11},
  number = {3},
  pages = {20:1--20:??},
  month = dec,
  year = {2018},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3242898},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Sat Oct 19 17:43:00 MDT 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {Recently, deep learning (DL) has become best-in-class for
    numerous applications but at a high computational cost that
    necessitates high-performance energy-efficient acceleration. The
    reconfigurability of FPGAs is appealing due to the rapid change in DL
    models but also causes lower performance and area-efficiency compared
    to ASICs. In this article, we implement three state-of-the-art
    computing architectures (CAs) for convolutional neural network (CNN)
    inference on FPGAs and ASICs. By comparing the FPGA and ASIC
    implementations, we highlight the area and performance costs of
    programmability to pinpoint the inefficiencies in current FPGA
    architectures. We perform our experiments using three variations of
    these CAs for AlexNet, VGG-16 and ResNet-50 to allow extensive
    comparisons. We find that the performance gap varies significantly from
    2.8$ \times $ to 6.3$ \times $, while the area gap is consistent across
    CAs with an 8.7 average FPGA-to-ASIC area ratio. Among different blocks
    of the CAs, the convolution engine, constituting up to 60\% of the
    total area, has a high area ratio ranging from 13 to 31. Motivated by
    our FPGA vs. ASIC comparisons, we suggest FPGA architectural changes
    such as increasing DSP block count, enhancing low-precision support in
    DSP blocks and rethinking the on-chip memories to reduce the
    programmability gap for DL applications.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {20},
  fjournal = {ACM Transactions on Reconfigurable Technology and Systems
    (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}

@Article{Rouhani:2018:RRT,
  author = {Bita Darvish Rouhani and Siam Umar Hussain and Kristin Lauter
    and Farinaz Koushanfar},
  title = {{ReDCrypt}: Real-Time Privacy-Preserving Deep Learning Inference
    in Clouds Using {FPGAs}},
  journal = j-TRETS,
  volume = {11},
  number = {3},
  pages = {21:1--21:??},
  month = dec,
  year = {2018},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3242899},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Sat Oct 19 17:43:00 MDT 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
    https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {Artificial Intelligence (AI) is increasingly incorporated
    into the cloud business in order to improve the functionality (e.g.,
    accuracy) of the service. The adoption of AI as a cloud service raises
    serious privacy concerns in applications where the risk of data leakage
    is not acceptable. Examples of such applications include scenarios
    where clients hold potentially sensitive private information such as
    medical records, financial data, and/or location. This article proposes
    ReDCrypt, the first reconfigurable hardware-accelerated framework that
    empowers privacy-preserving inference of deep learning models in cloud
    servers. ReDCrypt is well-suited for streaming (a.k.a., real-time AI)
    settings where clients need to dynamically analyze their data as it is
    collected over time without having to queue the samples to meet a
    certain batch size. Unlike prior work, ReDCrypt neither requires to
    change how AI models are trained nor relies on two non-colluding
    servers to perform. The privacy-preserving computation in ReDCrypt is
    executed using Yao's Garbled Circuit (GC) protocol. We break down the
    deep learning inference task into two phases: (i) privacy-insensitive
    (local) computation, and (ii) privacy-sensitive (interactive)
    computation. We devise a high-throughput and power-efficient
    implementation of GC protocol on FPGA for the privacy-sensitive phase.
    ReDCrypt's accompanying API provides support for seamless integration
    of ReDCrypt into any deep learning framework. Proof-of-concept
    evaluations for different DL applications demonstrate up to 57-fold
    higher throughput per core compared to the best prior solution with no
    drop in the accuracy.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {21},
  fjournal = {ACM Transactions on Reconfigurable Technology and Systems
    (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}

@Article{Yu:2018:IDC,
  author = {Jincheng Yu and Guangjun Ge and Yiming Hu and Xuefei Ning and
    Jiantao Qiu and Kaiyuan Guo and Yu Wang and Huazhong Yang},
  title = {Instruction Driven Cross-layer {CNN} Accelerator for Fast
    Detection on {FPGA}},
  journal = j-TRETS,
  volume = {11},
  number = {3},
  pages = {22:1--22:??},
  month = dec,
  year = {2018},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3283452},
  ISSN = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L = {1936-7406},
  bibdate = {Sat Oct 19 17:43:00 MDT 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract = {In recent years, Convolutional Neural Networks (CNNs) have
    been widely applied in computer vision and have achieved significant
    improvements in object detection tasks. Although there are many
    optimizing methods to speed up CNN-based detection algorithms, it is
    still difficult to deploy detection algorithms on real-time low-power
    systems. Field-Programmable Gate Array (FPGA) has been widely explored
    as a platform for accelerating CNN due to its promising performance,
    high energy efficiency, and flexibility. Previous works show that the
    energy consumption of CNN accelerators is dominated by the memory
    access. By fusing multiple layers in CNN, the intermediate data
    transfer can be reduced. However, previous accelerators with the
    cross-layer scheduling are designed for a particular CNN model. In
    addition to the memory access optimization, the Winograd algorithm can
    greatly improve the computational performance of convolution. In this
    article, to improve the flexibility of hardware, we design an
    instruction-driven CNN accelerator, supporting the Winograd algorithm
    and the cross-layer scheduling, for object detection. We modify the
    loop unrolling order of CNN, so that we can schedule a CNN across
    different layers with instructions and eliminate the intermediate data
    transfer. We propose a hardware architecture to support the
    instructions with Winograd computation units and reach the
    state-of-the-art energy efficiency. To deploy image detection
    algorithms onto the proposed accelerator with fixed-point computation
    units, we adopt the fixed-point fine-tune method, which can guarantee
    the accuracy of the detection algorithms. We evaluate our accelerator
    and scheduling policy on the Xilinx KU115 FPGA platform. The
    intermediate data transfer can be reduced by more than 90\% on the
    VGG-D CNN model with the cross-layer strategy. Thus, the performance of
    our hardware accelerator reaches 1700GOP/s on the classification model
    VGG-D. We also implement a framework for object detection algorithms,
    which achieves 2.3$ \times $ and 50$ \times $ in energy efficiency
    compared with GPU and CPU, respectively. Compared with floating-point
    algorithms, the accuracy of the fixed-point detection algorithms only
    drops by less than 1\%.},
  acknowledgement = ack-nhfb,
  ajournal = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno = {22},
  fjournal = {ACM Transactions on Reconfigurable Technology and Systems
    (TRETS)},
  journal-URL = {https://dl.acm.org/loi/trets},
}

@Article{Li:2018:EMP,
  author =       "Wensong Li and Fan Yang and Hengliang Zhu and Xuan
                 Zeng and Dian Zhou",
  title =        "An Efficient Memory Partitioning Approach for
                 Multi-Pattern Data Access via Data Reuse",
  journal =      j-TRETS,
  volume =       "12",
  number =       "1",
  pages =        "1:1--1:??",
  month =        apr,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3301296",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:01 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3301296",
  abstract =     "Memory bandwidth has become a bottleneck that impedes
                 performance improvement during the parallelism
                 optimization of the datapath. Memory partitioning is a
                 practical approach to reduce bank-level conflicts and
                 increase the bandwidth on a field-programmable gate
                 array. In this work, we propose a memory partitioning
                 approach for multi-pattern data access. First, we
                 propose to combine multiple patterns into a single
                 pattern to reduce the complexity of multi-pattern.
                 Then, we propose to perform data reuse analysis on the
                 combined pattern to find data reuse opportunities and
                 the non-reusable data pattern. Finally, an efficient
                 bank mapping algorithm with low complexity and low
                 overhead is proposed to find the optimal memory
                 partitioning solution. Experimental results
                 demonstrated that compared to the state-of-the-art
                 method, our proposed approach can reduce the number of
                 block RAMS by 58.9\% on average, with 79.6\% reduction
                 in SLICEs, 85.3\% reduction in LUTs, 67.9\% reduction
                 in Flip-Flops, 54.6\% reduction in DSP48Es,
                 83.9\% reduction in SRLs, 50.0\% reduction in storage
                 overhead, 95.0\% reduction in execution time, and
                 77.3\% reduction in dynamic power consumption on
                 average. Meanwhile, the performance can be improved by
                 14.0\% on average.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

%%% NOTE: year corrected from 2018 to 2019.  Volume 12, number 1 cannot
%%% predate volume 11, number 4 (January 2019) and must precede volume
%%% 12, number 2 (June 2019).  The citation label deliberately keeps its
%%% historical "2018" so that existing citations continue to resolve.
@Article{Guo:2018:DSF,
  author =       "Kaiyuan Guo and Shulin Zeng and Jincheng Yu and Yu
                 Wang and Huazhong Yang",
  title =        "{[DL]} A Survey of {FPGA}-based Neural Network
                 Inference Accelerators",
  journal =      j-TRETS,
  volume =       "12",
  number =       "1",
  pages =        "2:1--2:??",
  month =        apr,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3289185",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:01 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3289185",
  abstract =     "Recent research on neural networks has shown a
                 significant advantage in machine learning over
                 traditional algorithms based on handcrafted features
                 and models. Neural networks are now widely adopted in
                 regions like image, speech, and video recognition. But
                 the high computation and storage complexity of neural
                 network inference poses great difficulty on its
                 application. It is difficult for CPU platforms to offer
                 enough computation capacity. GPU platforms are the
                 first choice for neural network processes because of
                 its high computation capacity and easy-to-use
                 development frameworks. However, FPGA-based neural
                 network inference accelerator is becoming a research
                 topic. With specifically designed hardware, FPGA is the
                 next possible solution to surpass GPU in speed and
                 energy efficiency. Various FPGA-based accelerator
                 designs have been proposed with software and hardware
                 optimization techniques to achieve high speed and
                 energy efficiency. In this article, we give an overview
                 of previous work on neural network inference
                 accelerators based on FPGA and summarize the main
                 techniques used. An investigation from software to
                 hardware, from circuit level to system level is carried
                 out to complete analysis of FPGA-based neural network
                 inference accelerator design and serves as a guide to
                 future work.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

%%% NOTE: year corrected from 2018 to 2019.  Volume 12, number 1 cannot
%%% predate volume 11, number 4 (January 2019) and must precede volume
%%% 12, number 2 (June 2019).  The citation label deliberately keeps its
%%% historical "2018" so that existing citations continue to resolve.
@Article{Yazdanshenas:2018:CAM,
  author =       "Sadegh Yazdanshenas and Vaughn Betz",
  title =        "{COFFE 2}: Automatic Modelling and Optimization of
                 Complex and Heterogeneous {FPGA} Architectures",
  journal =      j-TRETS,
  volume =       "12",
  number =       "1",
  pages =        "3:1--3:??",
  month =        apr,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3301298",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:01 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3301298",
  abstract =     "FPGAs are becoming more heterogeneous to better adapt
                 to different markets, motivating rapid exploration of
                 different blocks/tiles for FPGAs. To evaluate a new
                 FPGA architectural idea, one should be able to
                 accurately obtain the area, delay, and energy
                 consumption of the block of interest. However, current
                 FPGA circuit design tools can only model simple,
                 homogeneous FPGA architectures with basic logic blocks
                 and also lack DSP and other heterogeneous block
                 support. Modern FPGAs are instead composed of many
                 different tiles, some of which are designed in a full
                 custom style and some of which mix standard cell and
                 full custom styles. To fill this modelling gap, we
                 introduce COFFE 2, an open-source FPGA design toolset
                 for automatic FPGA circuit design. COFFE 2 uses a mix
                 of full custom and standard cell flows and supports not
                 only complex logic blocks with fracturable lookup
                 tables and hard arithmetic but also arbitrary
                 heterogeneous blocks. To validate COFFE 2 and
                 demonstrate its features, we design and evaluate a
                 multi-mode Stratix III-like DSP block and several logic
                 tiles with fracturable LUTs and hard arithmetic. We
                 also demonstrate how COFFE 2's interface to VTR allows
                 full evaluation of block-routing interfaces and various
                 fracturable 6-LUT architectures.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

%%% NOTE: year corrected from 2018 to 2019.  Volume 12, number 1 cannot
%%% predate volume 11, number 4 (January 2019) and must precede volume
%%% 12, number 2 (June 2019).  The citation label deliberately keeps its
%%% historical "2018" so that existing citations continue to resolve.
@Article{Choi:2018:DAM,
  author =       "Young-Kyu Choi and Jason Cong and Zhenman Fang and
                 Yuchen Hao and Glenn Reinman and Peng Wei",
  title =        "In-Depth Analysis on Microarchitectures of Modern
                 Heterogeneous {CPU--FPGA} Platforms",
  journal =      j-TRETS,
  volume =       "12",
  number =       "1",
  pages =        "4:1--4:??",
  month =        apr,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3294054",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:01 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3294054",
  abstract =     "Conventional homogeneous multicore processors are not
                 able to provide the continued performance and energy
                 improvement that we have expected from past endeavors.
                 Heterogeneous architectures that feature specialized
                 hardware accelerators are widely considered a promising
                 paradigm for resolving this issue. Among different
                 heterogeneous devices, FPGAs that can be reconfigured
                 to accelerate a broad class of applications with
                 orders-of-magnitude performance/watt gains, are
                 attracting increased attention from both academia and
                 industry. As a consequence, a variety of CPU--FPGA
                 acceleration platforms with diversified
                 microarchitectural features have been supplied by
                 industry vendors. Such diversity, however, poses a
                 serious challenge to application developers in
                 selecting the appropriate platform for a specific
                 application or application domain. This article aims to
                 address this challenge by determining which
                 microarchitectural characteristics affect performance,
                 and in what ways. Specifically, we conduct a
                 quantitative comparison and an in-depth analysis on
                 five state-of-the-art CPU--FPGA acceleration platforms:
                 (1) the Alpha Data board and (2) the Amazon F1 instance
                 that represent the traditional PCIe-based platform with
                 private device memory; (3) the IBM CAPI that represents
                 the PCIe-based system with coherent shared memory; (4)
                 the first generation of the Intel Xeon+FPGA Accelerator
                 Platform that represents the QPI-based system with
                 coherent shared memory; and (5) the second generation
                 of the Intel Xeon+FPGA Accelerator Platform that
                 represents a hybrid PCIe-based (non-coherent) and
                 QPI-based (coherent) system with shared memory. Based
                 on the analysis of their CPU--FPGA communication
                 latency and bandwidth characteristics, we provide a
                 series of insights for both application developers and
                 platform designers. Furthermore, we conduct two case
                 studies to demonstrate how these insights can be
                 leveraged to optimize accelerator designs. The
                 microbenchmarks used for evaluation have been released
                 for public use.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

%%% NOTE: year corrected from 2018 to 2019.  Volume 12, number 1 cannot
%%% predate volume 11, number 4 (January 2019) and must precede volume
%%% 12, number 2 (June 2019).  The citation label deliberately keeps its
%%% historical "2018" so that existing citations continue to resolve.
%%% The parenthetical dashes around "the Bing web search engine" in the
%%% abstract were garbled (one en dash, one hyphen) and are now a matched
%%% pair of em dashes.
@Article{Cao:2018:FRA,
  author =       "Shijie Cao and Lanshun Nie and Dechen Zhan and
                 Wenqiang Wang and Ningyi Xu and Ramashis Das and Ming
                 Wu and Lintao Zhang and Derek Chiou",
  title =        "{FlexSaaS}: a Reconfigurable Accelerator for {Web}
                 Search Selection",
  journal =      j-TRETS,
  volume =       "12",
  number =       "1",
  pages =        "5:1--5:??",
  month =        apr,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3301409",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:01 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3301409",
  abstract =     "Web search engines deploy large-scale selection
                 services on CPUs to identify a set of web pages that
                 match user queries. An FPGA-based accelerator can
                 exploit various levels of parallelism and provide a
                 lower latency, higher throughput, more energy-efficient
                 solution than commodity CPUs. However, maintaining such
                 a customized accelerator in a commercial search engine
                 is challenging because selection services are changed
                 often. This article presents our design for FlexSaaS
                 (Flexible Selection as a Service), an FPGA-based
                 accelerator for web search selection. To address
                 efficiency and flexibility challenges, FlexSaaS
                 abstracts computing models and separates memory access
                 from computation. Specifically, FlexSaaS (i) contains a
                 reconfigurable number of matching processors that can
                 handle various possible query plans, (ii) decouples
                 index stream reading from matching computation to fetch
                 and decode index files, and (iii) includes a universal
                 memory accessor that hides the complex memory hierarchy
                 and reduces host data access latency. Evaluated on
                 FPGAs in the selection service of a commercial web
                 search---the Bing web search engine---FlexSaaS can be
                 evolved quickly to adapt to new updates. Compared to
                 the software baseline, FlexSaaS on Arria 10 reduces
                 average latency by 30\% and increases throughput by
                 1.5$ \times $.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Liu:2019:PFF,
  author =       {Gai Liu and Zhiru Zhang},
  title =        {{PIMap}: a Flexible Framework for Improving
                 {LUT}-Based Technology Mapping via Parallelized
                 Iterative Optimization},
  journal =      j-TRETS,
  volume =       {11},
  number =       {4},
  pages =        {23:1--23:??},
  month =        jan,
  year =         {2019},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3268344},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Sat Oct 19 17:43:01 MDT 2019},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL =          {https://dl.acm.org/ft_gateway.cfm?id=3268344},
  abstract =     {Modern FPGA synthesis tools typically apply a
                 predetermined sequence of logic optimizations on the
                 input logic network before carrying out technology
                 mapping. While the ``known recipes'' of logic
                 transformations often lead to improved mapping results,
                 there remains a nontrivial gap between the quality
                 metrics driving the pre-mapping logic optimizations and
                 those targeted by the actual technology mapping.
                 Needless to mention, such miscorrelations would
                 eventually result in suboptimal quality of results. In
                 this article, we propose PIMap, which couples logic
                 transformations and technology mapping under an
                 iterative improvement framework for LUT-based FPGAs. In
                 each iteration, PIMap randomly proposes a
                 transformation on the given logic network from an
                 ensemble of candidate optimizations; it then invokes
                 technology mapping and makes use of the mapping result
                 to determine the likelihood of accepting the proposed
                 transformation. By adjusting the optimization objective
                 and incorporating required time constraints during the
                 iterative process, PIMap can flexibly optimize for
                 different objectives including area minimization, delay
                 optimization, and delay-constrained area reduction. To
                 mitigate the runtime overhead, we further introduce
                 parallelization techniques to decompose a large design
                 into multiple smaller sub-netlists that can be
                 optimized simultaneously. Experimental results show
                 that PIMap achieves promising quality improvement over
                 a set of commonly used benchmarks, including improving
                 the majority of the best-known area and delay records
                 for the EPFL benchmark suite.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {23},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}

@Article{Wang:2019:FBA,
  author =       {Haomiao Wang and Prabu Thiagaraj and Oliver Sinnen},
  title =        {{FPGA}-based Acceleration of {FT} Convolution for
                 Pulsar Search Using {OpenCL}},
  journal =      j-TRETS,
  volume =       {11},
  number =       {4},
  pages =        {24:1--24:??},
  month =        jan,
  year =         {2019},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3268933},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Sat Oct 19 17:43:01 MDT 2019},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL =          {https://dl.acm.org/ft_gateway.cfm?id=3268933},
  abstract =     {The Square Kilometre Array (SKA) project will be the
                 world's largest radio telescope array. With its large
                 number of antennas, the number of signals that need to
                 be processed is dramatic. One important element of the
                 SKA's Central Signal Processor package is pulsar
                 search. This article focuses on the FPGA-based
                 acceleration of the Frequency-Domain Acceleration
                 Search module, which is a part of SKA pulsar search
                 engine. In this module, the frequency-domain input
                 signals have to be processed by 85 Finite Impulse
                 response (FIR) filters within a short period of
                 limitation and for thousands of input arrays. Because
                 of the large scale of the input length and FIR filter
                 size, even high-end FPGA devices cannot parallelise the
                 task completely. We start by investigating both
                 time-domain FIR filter (TDFIR) and frequency-domain FIR
                 filter (FDFIR) to tackle this task. We applied the
                 overlap-add algorithm to split the coefficient array of
                 TDFIR and the overlap-save algorithm to split the input
                 signals of FDFIR. To achieve fast prototyping design,
                 we employed OpenCL, which is a high-level FPGA
                 development technique. The performance and power
                 consumption are evaluated using multiple FPGA devices
                 simultaneously and compared with GPU results, which is
                 achieved by porting FPGA-based OpenCL kernels. The
                 experimental evaluation shows that the FDFIR solution
                 is very competitive in terms of performance, with a
                 clear energy consumption advantage over the GPU
                 solution.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {24},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}

@Article{Kroh:2019:EFG,
  author =       {Alexander Kroh and Oliver Diessel},
  title =        {Efficient Fine-grained Processor-logic Interactions on
                 the Cache-coherent {Zynq} Platform},
  journal =      j-TRETS,
  volume =       {11},
  number =       {4},
  pages =        {25:1--25:??},
  month =        jan,
  year =         {2019},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3277506},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Sat Oct 19 17:43:01 MDT 2019},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL =          {https://dl.acm.org/ft_gateway.cfm?id=3277506},
  abstract =     {The introduction of cache-coherent processor-logic
                 interconnects in CPU--FPGA platforms promises
                 low-latency communication between CPU and FPGA fabrics.
                 This reduced latency improves the performance of
                 heterogeneous systems implemented on such devices and
                 gives rise to new software architectures that can
                 better use the available hardware. Via an extended
                 study accelerating the software task scheduler of a
                 microkernel operating system, this article reports on
                 the potential for accelerating applications that
                 exhibit fine-grained interactions. In doing so, we
                 evaluate the performance of direct and cache-coherent
                 communication methods for applications that involve
                 frequent, low-bandwidth transactions between CPU and
                 programmable logic. In the specific case we studied, we
                 found that replacing a highly optimised software
                 implementation of the task scheduler with an FPGA-based
                 scheduler reduces the cost of communication between two
                 software threads by 5.5\%. We also found that, while
                 hardware acceleration reduces cache footprint, we still
                 observe execution time variability because of other
                 non-deterministic features of the CPU.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {25},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}

@Article{Dumpala:2019:LUE,
  author =       {Naveen Kumar Dumpala and Shivukumar B. Patil and
                 Daniel Holcomb and Russell Tessier},
  title =        {Loop Unrolling for Energy Efficiency in Low-Cost
                 Field-Programmable Gate Arrays},
  journal =      j-TRETS,
  volume =       {11},
  number =       {4},
  pages =        {26:1--26:??},
  month =        jan,
  year =         {2019},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3289186},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Sat Oct 19 17:43:01 MDT 2019},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL =          {https://dl.acm.org/ft_gateway.cfm?id=3289186},
  abstract =     {Field-programmable gate arrays (FPGAs) are used for a
                 wide variety of computations in low-cost embedded
                 systems. Although these systems often have modest
                 performance constraints, their energy consumption must
                 typically be limited. Many FPGA applications employ
                 repetitive loops that cannot be straightforwardly split
                 into parallel computations. Performing a loop
                 sequentially generally requires high-speed clocks that
                 consume considerable clock power and sometimes require
                 clock generation using a phase-locked loop (PLL). Loop
                 unrolling addresses the high-speed clock issue, but its
                 use often leads to significant combinational glitch
                 power. In this work, a computer-aided design (CAD)
                 approach that unrolls loops for designs targeted to
                 low-cost FPGAs is described. Our approach considers
                 latency constraints in an effort to minimize energy
                 consumption for loop-based computation. To reduce
                 glitch power, a glitch-filtering approach is introduced
                 that provides a balance between glitch reduction and
                 design performance. Glitch-filter enable signals are
                 generated and routed to the filters using resources
                 best suited to the target FPGA. Our approach
                 automatically inserts glitch filters and associated
                 control logic into a design prior to processing with
                 FPGA synthesis, place, and route tools. Our
                 energy-saving loop-unrolling approach has been
                 evaluated using five benchmarks often used in low-cost
                 FPGAs. The energy-saving capabilities of the approach
                 have been evaluated for an Intel Cyclone IV and a
                 Xilinx Artix-7 FPGA using board-level power
                 measurement. The use of unrolling and glitch filtering
                 is shown to reduce energy by at least 65\% for an
                 Artix-7 device and 50\% for a Cyclone IV device while
                 meeting design latency constraints.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {26},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}

%%% NOTE: this editorial is article number 6e, so its page range is
%%% written 6e:1--6e:??.  The previous value 6:1--6:?? contradicted the
%%% articleno field and duplicated the page range of the regular article
%%% number 6 in the same issue.
@Article{Chen:2019:EMN,
  author =       "Deming Chen",
  title =        "Editorial: a Message from the New {Editor-in-Chief}",
  journal =      j-TRETS,
  volume =       "12",
  number =       "2",
  pages =        "6e:1--6e:??",
  month =        jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3326451",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:01 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3326451",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "6e",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Morcel:2019:FAC,
  author =       {Raghid Morcel and Hazem Hajj and Mazen A. R. Saghir
                 and Haitham Akkary and Hassan Artail and Rahul Khanna
                 and Anil Keshavamurthy},
  title =        {{FeatherNet}: an Accelerated Convolutional Neural
                 Network Design for Resource-constrained {FPGAs}},
  journal =      j-TRETS,
  volume =       {12},
  number =       {2},
  pages =        {6:1--6:??},
  month =        jun,
  year =         {2019},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3306202},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Sat Oct 19 17:43:01 MDT 2019},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL =          {https://dl.acm.org/ft_gateway.cfm?id=3306202},
  abstract =     {Convolutional Neural Network (ConvNet or CNN)
                 algorithms are characterized by a large number of model
                 parameters and high computational complexity. These two
                 requirements have made it challenging for
                 implementations on resource-limited FPGAs. The
                 challenges are magnified when considering designs for
                 low-end FPGAs. While previous work has demonstrated
                 successful ConvNet implementations with high-end FPGAs,
                 this article presents a ConvNet accelerator design that
                 enables the implementation of complex deep ConvNet
                 architectures on resource-constrained FPGA platforms
                 aimed at the IoT market. We call the design
                 ``FeatherNet'' for its light resource utilization. The
                 implementations are VHDL-based providing flexibility in
                 design optimizations. As part of the design process,
                 new methods are introduced to address several design
                 challenges. The first method is a novel stride-aware
                 graph-based method targeted at ConvNets that aims at
                 achieving efficient signal processing with reduced
                 resource utilization. The second method addresses the
                 challenge of determining the minimal precision
                 arithmetic needed while preserving high accuracy. For
                 this challenge, we propose variable-width dynamic
                 fixed-point representations combined with a
                 layer-by-layer design-space pruning heuristic across
                 the different layers of the deep ConvNet model. The
                 third method aims at achieving a modular design that
                 can support different types of ConvNet layers while
                 ensuring low resource utilization. For this challenge,
                 we propose the modules to be relatively small and
                 composed of computational filters that can be
                 interconnected to build an entire accelerator design.
                 These model elements can be easily configured through
                 HDL parameters (e.g., layer type, mask size, stride,
                 etc.) to meet the needs of specific ConvNet
                 implementations and thus they can be reused to
                 implement a wide variety of ConvNet architectures. The
                 fourth method addresses the challenge of design
                 portability between two different FPGA vendor
                 platforms, namely, Intel/Altera and Xilinx. For this
                 challenge, we propose to instantiate the
                 device-specific hardware blocks needed in each
                 computational filter, rather than relying on the
                 synthesis tools to infer these blocks, while keeping
                 track of the similarities and differences between the
                 two platforms. We believe that the solutions to these
                 design challenges further advance knowledge as they can
                 benefit designers and other researchers using similar
                 devices or facing similar challenges. Our results
                 demonstrated the success of addressing the design
                 challenges and achieving low (30\%) resource
                 utilization for the low-end FPGA platforms: Zedboard
                 and Cyclone V. The design overcame the limitation of
                 designs targeted for high-end platforms and that cannot
                 fit on low-end IoT platforms. Furthermore, our design
                 showed superior performance results (measured in terms
                 of [Frame/s/W] per Dollar) compared to high-end
                 optimized designs.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {6},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}

@Article{Zhou:2019:FAN,
  author =       {Xuegong Zhou and Lingli Wang and Alan Mishchenko},
  title =        {Fast Adjustable {NPN} Classification Using Generalized
                 Symmetries},
  journal =      j-TRETS,
  volume =       {12},
  number =       {2},
  pages =        {7:1--7:??},
  month =        jun,
  year =         {2019},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3313917},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Sat Oct 19 17:43:01 MDT 2019},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL =          {https://dl.acm.org/ft_gateway.cfm?id=3313917},
  abstract =     {NPN classification of Boolean functions is a powerful
                 technique used in many logic synthesis and technology
                 mapping tools in both standard cell and FPGA design
                 flows. Computing the canonical form is the most common
                 approach of Boolean function classification. This
                 article proposes two different hybrid NPN canonical
                 forms and a new algorithm to compute them. By
                 exploiting symmetries under different phase assignment
                 as well as higher-order symmetries, the search space of
                 NPN canonical form computation is pruned and the
                 runtime is dramatically reduced. Nevertheless, the
                 runtime for some difficult functions remains high. Fast
                 heuristic method can be used for such functions to
                 compute semi-canonical forms in a reasonable time. The
                 proposed algorithm can be adjusted to be a slow exact
                 algorithm or a fast heuristic algorithm with lower
                 quality. For exact NPN classification, the proposed
                 algorithm is 40$ \times $ faster than state-of-the-art.
                 For heuristic classification, the proposed algorithm
                 has similar performance as state-of-the-art with a
                 possibility to trade runtime for quality.},
  acknowledgement = ack-nhfb,
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  articleno =    {7},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {https://dl.acm.org/loi/trets},
}

@Article{Oppermann:2019:EPM,
  author =       "Julian Oppermann and Melanie Reuter-Oppermann and
                 Lukas Sommer and Andreas Koch and Oliver Sinnen",
  title =        "Exact and Practical Modulo Scheduling for High-Level
                 Synthesis",
  journal =      j-TRETS,
  volume =       "12",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3317670",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:01 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3317670",
  abstract =     "Loop pipelining is an essential technique in
                 high-level synthesis to increase the throughput and
                 resource utilisation of field-programmable gate
                 array--based accelerators. It relies on modulo
                 schedulers to compute an operator schedule that allows
                 subsequent loop iterations to overlap partially when
                 executed while still honouring all precedence and
                 resource constraints. Modulo schedulers face a
                 bi-criteria problem: minimise the initiation interval
                 (II; i.e., the number of timesteps after which new
                 iterations are started) and minimise the schedule
                 length. We present Moovac, a novel exact formulation
                 that models all aspects (including the II minimisation)
                 of the modulo scheduling problem as a single integer
                 linear program, and discuss simple measures to prevent
                 excessive runtimes, to challenge the old preconception
                 that exact modulo scheduling is impractical. We
                 substantiate this claim by conducting an experimental
                 study covering 188 loops from two established
                 high-level synthesis benchmark suites, four different
                 time limits, and three bounds for the schedule length,
                 to compare our approach against a highly tuned exact
                 formulation and a state-of-the-art heuristic algorithm.
                 In the fastest configuration, an accumulated runtime of
                 under 16 minutes is spent on scheduling all loops, and
                 proven optimal IIs are found for 179 test instances.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Bo:2019:APR,
  author =       "Chunkun Bo and Vinh Dang and Ted Xie and Jack Wadden
                 and Mircea Stan and Kevin Skadron",
  title =        "Automata Processing in Reconfigurable Architectures:
                 In-the-Cloud Deployment, Cross-Platform Evaluation, and
                 Fast Symbol-Only Reconfiguration",
  journal =      j-TRETS,
  volume =       "12",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3314576",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:01 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3314576",
  abstract =     "We present a general automata processing framework on
                 FPGAs, which generates an RTL kernel for automata
                 processing together with an AXI and PCIe based I/O
                 circuitry. We implement the framework on both local
                 nodes and cloud platforms (Amazon AWS and Nimbix) with
                 novel features. A full performance comparison of the
                 proposed framework is conducted against
                 state-of-the-art automata processing engines on CPUs,
                 GPUs, and Micron's Automata Processor using the ANMLZoo
                 benchmark suite and some real-world datasets. Results
                 show that FPGAs enable extremely high-throughput
                 automata processing compared to von Neumann
                 architectures. We also collect the resource utilization
                 and power consumption on the two cloud platforms, and
                 find that the I/O circuitry consumes most of the
                 hardware resources and power. Furthermore, we propose a
                 fast, symbol-only reconfiguration mechanism based on
                 the framework for large pattern sets that cannot fit on
                 a single device and need to be partitioned. The
                 proposed method supports multiple passes of the input
                 stream and reduces the re-compilation cost from hours
                 to seconds.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Dinh:2019:NFI,
  author =       "Van Luan Dinh and Xuan Truong Nguyen and Hyuk-Jae
                 Lee",
  title =        "A Novel {FPGA} Implementation of a Time-to-Digital
                 Converter Supporting Run-Time Estimation and
                 Compensation",
  journal =      j-TRETS,
  volume =       "12",
  number =       "2",
  pages =        "10:1--10:??",
  month =        jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3322482",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:01 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3322482",
  abstract =     "Time-to-digital converters (TDCs) are widely used in
                 applications that require the measurement of the time
                 interval between events. In previous designs using a
                 feedback loop and an extended delay line,
                 process-voltage-temperature (PVT) variation often
                 decreases the accuracy of measurements. To overcome the
                 loss of accuracy caused by PVT variation, this study
                 proposes a novel design of a synthesizable TDC that
                 employs run-time estimation and compensation of PVT
                 variation. A delay line consisting of a series of
                 buffers is used to detect the period of a ring
                 oscillator designed to measure the time interval
                 between two events. By comparing the detected period
                 and the system clock, the variation of the oscillation
                 period is compensated at run-time. The proposed TDC is
                 successfully implemented by using a low-cost Xilinx
                 Spartan-6 LX9 FPGA with a 50-MHz oscillator.
                 Experimental results show that the proposed TDC is
                 robust to PVT variation with a resolution of 19.1 ps.
                 In comparison with previous design, the proposed TDC
                 achieves about five times better tradeoff in the area,
                 resolution, and frequency of the reference clock.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Bobda:2019:ISS,
  author =       "Christophe Bobda and Ken Eguro",
  title =        "Introduction to the Special Section on Security in
                 {FPGA}-accelerated Cloud and Datacenters",
  journal =      j-TRETS,
  volume =       "12",
  number =       "3",
  pages =        "11:1--11:??",
  month =        sep,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3352060",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:02 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3352060",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "11e",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Giechaskiel:2019:LWE,
  author =       "Ilias Giechaskiel and Ken Eguro and Kasper B.
                 Rasmussen",
  title =        "Leakier Wires: Exploiting {FPGA} Long Wires for
                 Covert- and Side-channel Attacks",
  journal =      j-TRETS,
  volume =       "12",
  number =       "3",
  pages =        "11:1--11:??",
  month =        sep,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3322483",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:02 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3322483",
  abstract =     "In complex FPGA designs, implementations of algorithms
                 and protocols from third-party sources are common.
                 However, the monolithic nature of FPGAs means that all
                 sub-circuits share common on-chip infrastructure, such
                 as routing resources. This presents an attack vector
                 for all FPGAs that contain designs from multiple
                 vendors, especially for FPGAs used in multi-tenant
                 cloud environments, or integrated into multi-core
                 processors. In this article, we show that ``long''
                 routing wires present a new source of information
                 leakage on FPGAs, by influencing the delay of adjacent
                 long wires. We show that the effect is measurable for
                 both static and dynamic signals and that it can be
                 detected using small on-board circuits. We characterize
                 the channel in detail and show that it is measurable
                 even when multiple competing circuits (including
                 multiple long-wire transmitters) are present and can be
                 replicated on different generations and families of
                 Xilinx devices (Virtex 5, Virtex 6, Artix 7, and
                 Spartan 7). We exploit the leakage to create a covert
                 channel with 6kbps of bandwidth and 99.9\% accuracy,
                 and a side channel, which can recover signals kept
                 constant for only 1.3 $ \mu $ s, with an accuracy of
                 more than 98.4\%. Finally, we propose countermeasures
                 to reduce the impact of this leakage.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Krautter:2019:MEL,
  author =       "Jonas Krautter and Dennis R. E. Gnad and Mehdi B.
                 Tahoori",
  title =        "Mitigating Electrical-level Attacks towards Secure
                 Multi-Tenant {FPGAs} in the Cloud",
  journal =      j-TRETS,
  volume =       "12",
  number =       "3",
  pages =        "12:1--12:??",
  month =        sep,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3328222",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:02 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3328222",
  abstract =     "A rising trend is the use of multi-tenant FPGAs,
                 particularly in cloud environments, where partial
                 access to the hardware is given to multiple third
                 parties. This leads to new types of attacks in FPGAs,
                 which operate not only on the logic level, but also on
                 the electrical level through the common power delivery
                 network. Since FPGAs are configured from the
                 software-side, attackers are enabled to launch hardware
                 attacks from software, impacting the security of an
                 entire system. In this article, we show the first
                 attempt of a countermeasure against attacks on the
                 electrical level, which is based on a bitstream
                 checking methodology. Bitstreams are translated back
                 into flat technology mapped netlists, which are then
                 checked for properties that indicate potential
                 malicious runtime behavior of FPGA logic. Our approach
                 can provide a metric of potential risk of the FPGA
                 bitstream being used in active fault or passive
                 side-channel attacks against other users of the FPGA
                 fabric or the entire SoC platform.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Elrabaa:2019:PPP,
  author =       "Muhammad E. S. Elrabaa and Mohamed A. Al-Asli and
                 Marwan H. Abu-Amara",
  title =        "A Protection and Pay-per-use Licensing Scheme for
                 On-cloud {FPGA} Circuit {IPs}",
  journal =      j-TRETS,
  volume =       "12",
  number =       "3",
  pages =        "13:1--13:??",
  month =        sep,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3329861",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:02 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3329861",
  abstract =     "Using security primitives, a novel scheme for
                 licensing hardware intellectual properties (HWIPs) on
                 Field Programmable Gate Arrays (FPGAs) in public clouds
                 is proposed. The proposed scheme enforces a pay-per-use
                 model, allows HWIP's installation only on specific
                 on-cloud FPGAs, and efficiently protects the HWIPs from
                 being cloned, reverse engineered, or used without the
                 owner's authorization by any party, including a cloud
                 insider. It also provides protection for the users'
                 designs integrated with the HWIP on the same FPGA. This
                 enables cloud tenants to license HWIPs in the cloud
                 from the HWIP vendors at a relatively low price based
                 on usage instead of paying the expensive unlimited HWIP
                 license fee. The scheme includes a protocol for FPGA
                 authentication, HWIP secure decryption, and usage by
                 the clients without the need for the HWIP vendor to be
                 involved or divulge their secret keys. A complete
                 prototype test-bed implementation showed that the
                 proposed scheme is very feasible with relatively low
                 resource utilization. Experiments also showed that a
                 HWIP could be licensed and set up in the on-cloud FPGA
                 in 0.9s. This is 15 times faster than setting up the
                 same HWIP from outside the cloud, which takes about 14s
                 based on the average global Internet speed.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Zhang:2019:RAD,
  author =       "Jiliang Zhang and Gang Qu",
  title =        "Recent Attacks and Defenses on {FPGA}-based Systems",
  journal =      j-TRETS,
  volume =       "12",
  number =       "3",
  pages =        "14:1--14:??",
  month =        sep,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3340557",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:02 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3340557",
  abstract =     "Field-programmable gate array (FPGA) is a kind of
                 programmable chip that is widely used in many areas,
                 including automotive electronics, medical devices,
                 military and consumer electronics, and is gaining more
                 popularity. Unlike the application specific integrated
                 circuits (ASIC) design, an FPGA-based system has its
                 own supply-chain model and design flow, which brings
                 interesting security and trust challenges. In this
                 survey, we review the security and trust issues related
                 to FPGA-based systems from the market perspective,
                 where we model the market with the following parties:
                 FPGA vendors, foundries, IP vendors, EDA tool vendors,
                 FPGA-based system developers, and end-users. For each
                 party, we show the security and trust problems they
                 need to be aware of and the associated solutions that
                 are available. We also discuss some challenges and
                 opportunities in the security and trust of FPGA-based
                 systems used in large-scale cloud and datacenters.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Umuroglu:2019:OBS,
  author =       "Yaman Umuroglu and Davide Conficconi and Lahiru
                 Rasnayake and Thomas B. Preusser and Magnus
                 Sj{\"a}lander",
  title =        "Optimizing Bit-Serial Matrix Multiplication for
                 Reconfigurable Computing",
  journal =      j-TRETS,
  volume =       "12",
  number =       "3",
  pages =        "15:1--15:??",
  month =        sep,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3337929",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:02 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3337929",
  abstract =     "Matrix--matrix multiplication is a key computational
                 kernel for numerous applications in science and
                 engineering, with ample parallelism and data locality
                 that lends itself well to high-performance
                 implementations. Many matrix multiplication-dependent
                 applications can use reduced-precision integer or
                 fixed-point representations to increase their
                 performance and energy efficiency while still offering
                 adequate quality of results. However, precision
                 requirements may vary between different application
                 phases or depend on input data, rendering
                 constant-precision solutions ineffective. BISMO, a
                 vectorized bit-serial matrix multiplication overlay for
                 reconfigurable computing, previously utilized the
                 excellent binary-operation performance of FPGAs to
                 offer a matrix multiplication performance that scales
                 with required precision and parallelism. We show how
                 BISMO can be scaled up on Xilinx FPGAs using an
                 arithmetic architecture that better utilizes six-input
                 LUTs. The improved BISMO achieves a peak performance of
                 15.4 binary TOPS on the Ultra96 board with a Xilinx
                 UltraScale+ MPSoC.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Al-Hyari:2019:NCE,
  author =       "Abeer Al-Hyari and Ziad Abuowaimer and Timothy Martin
                 and Gary Gr{\'e}wal and Shawki Areibi and Anthony
                 Vannelli",
  title =        "Novel Congestion-estimation and Routability-prediction
                 Methods based on Machine Learning for Modern {FPGAs}",
  journal =      j-TRETS,
  volume =       "12",
  number =       "3",
  pages =        "16:1--16:??",
  month =        sep,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3337930",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:02 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3337930",
  abstract =     "Effectively estimating and managing congestion during
                 placement can save substantial placement and routing
                 runtime. In this article, we present a machine-learning
                 model for accurately and efficiently estimating
                 congestion during FPGA placement. Compared with the
                 state-of-the-art machine-learning congestion-estimation
                 model, our results show a 25\% improvement in
                 prediction accuracy. This makes our model competitive
                 with congestion estimates produced using a global
                 router. However, our model runs, on average, 291$
                 \times $ faster than the global router. Overall, we are
                 able to reduce placement runtimes by 17\% and router
                 runtimes by 19\%. An additional machine-learning model
                 is also presented that uses the output of the first
                 congestion-estimation model to determine whether or not
                 a placement is routable. This second model has an
                 accuracy in the range of 93\% to 98\%, depending on the
                 classification algorithm used to implement the learning
                 model, and runtimes of a few milliseconds, thus making
                 it suitable for inclusion in any placer with no worry
                 of additional computational overhead.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Owaida:2019:DID,
  author =       "Muhsen Owaida and Amit Kulkarni and Gustavo Alonso",
  title =        "Distributed Inference over Decision Tree Ensembles on
                 Clusters of {FPGAs}",
  journal =      j-TRETS,
  volume =       "12",
  number =       "4",
  pages =        "17:1--17:??",
  month =        oct,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3340263",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:02 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3340263",
  abstract =     "Given the growth in data inputs and application
                 complexity, it is often the case that a single hardware
                 accelerator is not enough to solve a given problem. In
                 particular, the computational demands and I/O of many
                 tasks in machine learning often require a cluster of
                 accelerators to make a relevant difference in
                 performance. In this article, we explore the efficient
                 construction of FPGA clusters using inference over
                 Decision Tree Ensembles as the target application. The
                 article explores several levels of the problem: (1) a
                 lightweight inter-FPGA communication protocol and
                 routing layer to facilitate the communication between
                 the different FPGAs, (2) the data partitioning and
                 distribution strategies maximizing performance, (3) and
                 an in depth analysis on how applications can be
                 efficiently distributed over such a cluster. The
                 experimental analysis shows that the resulting system
                 can support inference over decision tree ensembles at a
                 significantly higher throughput than that achieved by
                 existing systems.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "17",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Ahmed:2019:FAB,
  author =       "Ibrahim Ahmed and Shuze Zhao and James Meijers and
                 Olivier Trescases and Vaughn Betz",
  title =        "{FRoC 2.0}: Automatic {BRAM} and Logic Testing to
                 Enable Dynamic Voltage Scaling for {FPGA}
                 Applications",
  journal =      j-TRETS,
  volume =       "12",
  number =       "4",
  pages =        "20:1--20:??",
  month =        oct,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3354188",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:02 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3354188",
  abstract =     "In earlier technology nodes, FPGAs had low power
                 consumption compared to other compute chips such as
                 CPUs and GPUs. However, in the 14nm technology node,
                 FPGAs are consuming unprecedented power in the 100+W
                 range, making power consumption a pressing concern. To
                 reduce FPGA power consumption, several researchers have
                 proposed deploying dynamic voltage scaling. While the
                 previously proposed solutions show promising results,
                 they have difficulty guaranteeing safe operation at
                 reduced voltages for applications that use the FPGA
                 hard blocks. In this work, we present the first DVS
                 solution that is able to fully handle FPGA applications
                 that use BRAMs. Our solution not only robustly tests
                 the soft logic component of the application but also
                 tests all components connected to the BRAMs. We extend
                 a previously proposed CAD tool, FRoC, to automatically
                 generate calibration bitstreams that are used to
                 measure the application's critical path delays on
                 silicon. The calibration bitstreams also include
                 testers that ensure all used SRAM cells operate safely
                 while scaling V$_{dd}$. We experimentally show that
                 using our DVS solution we can save 32\% of the total
                 power consumed by a discrete Fourier transform
                 application running with the fixed nominal supply
                 voltage and clocked at the F$_{max}$ reported by static
                 timing analysis.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "20",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Tridgell:2019:UTN,
  author =       "Stephen Tridgell and Martin Kumm and Martin Hardieck
                 and David Boland and Duncan Moss and Peter Zipf and
                 Philip H. W. Leong",
  title =        "Unrolling Ternary Neural Networks",
  journal =      j-TRETS,
  volume =       "12",
  number =       "4",
  pages =        "22:1--22:??",
  month =        oct,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3359983",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Oct 19 17:43:02 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3359983",
  abstract =     "The computational complexity of neural networks for
                 large-scale or real-time applications necessitates
                 hardware acceleration. Most approaches assume that the
                 network architecture and parameters are unknown at
                 design time, permitting usage in a large number of
                 applications. This article demonstrates, for the case
                 where the neural network architecture and ternary
                 weight values are known a priori, that extremely high
                 throughput implementations of neural network inference
                 can be made by customising the datapath and routing to
                 remove unnecessary computations and data movement. This
                 approach is ideally suited to FPGA implementations as a
                 specialized implementation of a trained network
                 improves efficiency while still retaining generality
                 with the reconfigurability of an FPGA. A VGG-style
                 network with ternary weights and fixed point
                 activations is implemented for the CIFAR10 dataset on
                 Amazon's AWS F1 instance. This article demonstrates how
                 to remove 90\% of the operations in convolutional
                 layers by exploiting sparsity and compile-time
                 optimizations. The implementation in hardware achieves
                 90.9 $ \pm $ 0.1\% accuracy and 122k frames per second,
                 with a latency of only 29 $ \mu $ s, which is the fastest
                 CNN inference implementation reported so far on an
                 FPGA.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "22",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Serre:2020:DBH,
  author =       {Fran{\c{c}}ois Serre and Markus P{\"u}schel},
  title =        {{DSL}-Based Hardware Generation with {Scala}: Example
                 {Fast Fourier Transforms} and Sorting Networks},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {13},
  number =       {1},
  articleno =    {1},
  pages =        {1:1--1:23},
  month =        feb,
  year =         {2020},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  DOI =          {https://doi.org/10.1145/3359754},
  URL =          {https://dl.acm.org/doi/abs/10.1145/3359754},
  journal-URL =  {https://dl.acm.org/loi/trets},
  abstract =     {We present a hardware generator for computations with
                 regular structure including the fast Fourier transform
                 (FFT), sorting networks, and others. The input of the
                 generator is a high-level description of the algorithm;
                 the output is a token-based, \ldots{}},
  bibdate =      {Thu Feb 6 08:37:52 MST 2020},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Alachiotis:2020:RXF,
  author =       {Nikolaos Alachiotis and Charalampos Vatsolakis and
                 Grigorios Chrysos and Dionisios Pnevmatikatos},
  title =        {{RAiSD-X}: a Fast and Accurate {FPGA} System for the
                 Detection of Positive Selection in Thousands of
                 Genomes},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {13},
  number =       {1},
  articleno =    {2},
  pages =        {2:1--2:30},
  month =        feb,
  year =         {2020},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  DOI =          {https://doi.org/10.1145/3364225},
  URL =          {https://dl.acm.org/doi/abs/10.1145/3364225},
  journal-URL =  {https://dl.acm.org/loi/trets},
  abstract =     {Detecting traces of positive selection in genomes
                 carries theoretical significance and has practical
                 applications from shedding light on the forces that
                 drive adaptive evolution to the design of more
                 effective drug treatments. The size of genomic
                 \ldots{}},
  bibdate =      {Thu Feb 6 08:37:52 MST 2020},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Attia:2020:FFI,
  author =       {Sameh Attia and Vaughn Betz},
  title =        {Feel Free to Interrupt: Safe Task Stopping to Enable
                 {FPGA} Checkpointing and Context Switching},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {13},
  number =       {1},
  articleno =    {3},
  pages =        {3:1--3:27},
  month =        feb,
  year =         {2020},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  DOI =          {https://doi.org/10.1145/3372491},
  URL =          {https://dl.acm.org/doi/abs/10.1145/3372491},
  journal-URL =  {https://dl.acm.org/loi/trets},
  abstract =     {Saving and restoring an FPGA task state in an orderly
                 manner is essential to enable hardware checkpointing,
                 which is highly desirable to improve the ability to
                 debug cloud-scale hardware services, and context
                 switching, which allows multiple users to \ldots{}},
  bibdate =      {Thu Feb 6 08:37:52 MST 2020},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Jamal:2020:FTH,
  author =       {Al-Shahna Jamal and Eli Cahill and Jeffrey Goeders and
                 Steven J. E. Wilton},
  title =        {Fast Turnaround {HLS} Debugging Using Dependency
                 Analysis and Debug Overlays},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {13},
  number =       {1},
  articleno =    {4},
  pages =        {4:1--4:26},
  month =        feb,
  year =         {2020},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  DOI =          {https://doi.org/10.1145/3372490},
  URL =          {https://dl.acm.org/doi/abs/10.1145/3372490},
  journal-URL =  {https://dl.acm.org/loi/trets},
  abstract =     {High-level synthesis (HLS) has gained considerable
                 traction over recent years, as it allows for faster
                 development and verification of hardware accelerators
                 than traditional RTL design. While HLS allows for most
                 bugs to be caught during software \ldots{}},
  bibdate =      {Thu Feb 6 08:37:52 MST 2020},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Kourfali:2020:CDD,
  author =       {Alexandra Kourfali and Dirk Stroobandt},
  title =        {In-Circuit Debugging with Dynamic Reconfiguration of
                 {FPGA} Interconnects},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {13},
  number =       {1},
  articleno =    {5},
  pages =        {5:1--5:29},
  month =        feb,
  year =         {2020},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  DOI =          {https://doi.org/10.1145/3375459},
  URL =          {https://dl.acm.org/doi/abs/10.1145/3375459},
  journal-URL =  {https://dl.acm.org/loi/trets},
  abstract =     {In this work, a novel method for in-circuit debugging
                 on FPGAs is introduced that allows the insertion of
                 low-overhead debugging infrastructure by exploiting the
                 technique of parameterized configurations. This allows
                 the parameterization of the LUTs and \ldots{}},
  bibdate =      {Thu Feb 6 08:37:52 MST 2020},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Garg:2020:HNC,
  author =       "Tushar Garg and Saud Wasly and Rodolfo Pellizzoni and
                 Nachiket Kapre",
  title =        "{HopliteBuf}: Network Calculus-Based Design of {FPGA
                 NoCs} with Provably Stall-Free {FIFOs}",
  journal =      j-TRETS,
  volume =       "13",
  number =       "2",
  pages =        "6:1--6:35",
  month =        jun,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3375899",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Jun 11 15:19:14 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3375899",
  abstract =     "HopliteBuf is a deflection-free, low-cost, and
                 high-speed FPGA overlay Network-on-chip (NoC) with
                 stall-free buffers. It is an FPGA-friendly 2D
                 unidirectional torus topology built on top of HopliteRT
                 overlay NoC. The stall-free buffers in HopliteBuf are
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Fraser:2020:KNL,
  author =       {Nicholas J. Fraser and Philip H. W. Leong},
  title =        {Kernel Normalised Least Mean Squares with Delayed
                 Model Adaptation},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {13},
  number =       {2},
  articleno =    {7},
  pages =        {7:1--7:30},
  month =        jun,
  year =         {2020},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  DOI =          {https://doi.org/10.1145/3376924},
  URL =          {https://dl.acm.org/doi/abs/10.1145/3376924},
  journal-URL =  {https://dl.acm.org/loi/trets},
  abstract =     {Kernel adaptive filters (KAFs) are non-linear filters
                 which can adapt temporally and have the additional
                 benefit of being computationally efficient through use
                 of the ``kernel trick''. In a number of real-world
                 applications, such as channel equalisation, \ldots{}},
  bibdate =      {Thu Jun 11 15:19:14 MDT 2020},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Besta:2020:SCM,
  author =       {Maciej Besta and Marc Fischer and Tal Ben-Nun and
                 Dimitri Stanojevic and Johannes De Fine Licht and
                 Torsten Hoefler},
  title =        {Substream-Centric Maximum Matchings on {FPGA}},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {13},
  number =       {2},
  articleno =    {8},
  pages =        {8:1--8:33},
  month =        jun,
  year =         {2020},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  DOI =          {https://doi.org/10.1145/3377871},
  URL =          {https://dl.acm.org/doi/abs/10.1145/3377871},
  journal-URL =  {https://dl.acm.org/loi/trets},
  abstract =     {Developing high-performance and energy-efficient
                 algorithms for maximum matchings is becoming
                 increasingly important in social network analysis,
                 computational sciences, scheduling, and others. In this
                 work, we propose the first maximum matching \ldots{}},
  bibdate =      {Thu Jun 11 15:19:14 MDT 2020},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Murray:2020:VHP,
  author =       {Kevin E. Murray and Oleg Petelin and Sheng Zhong and
                 Jia Min Wang and Mohamed Eldafrawy and Jean-Philippe
                 Legault and Eugene Sha and Aaron G. Graham and Jean Wu
                 and Matthew J. P. Walker and Hanqing Zeng and
                 Panagiotis Patros and Jason Luu and Kenneth B. Kent and
                 Vaughn Betz},
  title =        {{VTR 8}: High-performance {CAD} and Customizable
                 {FPGA} Architecture Modelling},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {13},
  number =       {2},
  articleno =    {9},
  pages =        {9:1--9:55},
  month =        jun,
  year =         {2020},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  DOI =          {https://doi.org/10.1145/3388617},
  URL =          {https://dl.acm.org/doi/abs/10.1145/3388617},
  journal-URL =  {https://dl.acm.org/loi/trets},
  abstract =     {Developing Field-programmable Gate Array (FPGA)
                 architectures is challenging due to the competing
                 requirements of various application domains and
                 changing manufacturing process technology. This is
                 compounded by the difficulty of fairly evaluating FPGA
                 \ldots{}},
  bibdate =      {Thu Jun 11 15:19:14 MDT 2020},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Delomier:2020:MBD,
  author =       "Yann Delomier and Bertrand {Le Gal} and J{\'e}r{\'e}mie
                 Crenne and Christophe Jego",
  title =        "Model-based Design of Hardware {SC} Polar Decoders for
                 {FPGAs}",
  journal =      j-TRETS,
  volume =       "13",
  number =       "2",
  pages =        "10:1--10:27",
  month =        jun,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3391431",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Jun 11 15:19:14 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3391431",
  abstract =     "Polar codes are a new error correction code family
                 that should be benchmarked and evaluated in comparison
                 to LDPC and turbo-codes. Indeed, recent advances in the
                 5G digital communication standard recommended the use
                 of polar codes in EMBB control \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Shao:2020:PGF,
  author =       {Zhiyuan Shao and Chenhao Liu and Ruoshi Li and Xiaofei
                 Liao and Hai Jin},
  title =        {Processing Grid-format Real-world Graphs on
                 {DRAM}-based {FPGA} Accelerators with
                 Application-specific Caching Mechanisms},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {13},
  number =       {3},
  articleno =    {11},
  pages =        {11:1--11:33},
  month =        sep,
  year =         {2020},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  DOI =          {https://doi.org/10.1145/3391920},
  URL =          {https://dl.acm.org/doi/10.1145/3391920},
  journal-URL =  {https://dl.acm.org/loi/trets},
  abstract =     {Graph processing is one of the important research
                 topics in the big-data era. To build a general
                 framework for graph processing by using a DRAM-based
                 FPGA board with deep memory hierarchy, one of the
                 reasonable methods is to partition a given big graph
                 \ldots{}},
  bibdate =      {Sat Sep 5 18:51:36 MDT 2020},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Eldafrawy:2020:FLB,
  author =       {Mohamed Eldafrawy and Andrew Boutros and Sadegh
                 Yazdanshenas and Vaughn Betz},
  title =        {{FPGA} Logic Block Architectures for Efficient Deep
                 Learning Inference},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {13},
  number =       {3},
  articleno =    {12},
  pages =        {12:1--12:34},
  month =        sep,
  year =         {2020},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  DOI =          {https://doi.org/10.1145/3393668},
  URL =          {https://dl.acm.org/doi/10.1145/3393668},
  journal-URL =  {https://dl.acm.org/loi/trets},
  abstract =     {Reducing the precision of deep neural network (DNN)
                 inference accelerators can yield large efficiency gains
                 with little or no accuracy degradation compared to half
                 or single precision floating-point by enabling more
                 multiplication operations per unit \ldots{}},
  bibdate =      {Sat Sep 5 18:51:36 MDT 2020},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Mu:2020:OOB,
  author =       {Jiandong Mu and Wei Zhang and Hao Liang and Sharad
                 Sinha},
  title =        {Optimizing {OpenCL}-Based {CNN} Design on {FPGA} with
                 Comprehensive Design Space Exploration and
                 Collaborative Performance Modeling},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {13},
  number =       {3},
  articleno =    {13},
  pages =        {13:1--13:28},
  month =        sep,
  year =         {2020},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  DOI =          {https://doi.org/10.1145/3397514},
  URL =          {https://dl.acm.org/doi/10.1145/3397514},
  journal-URL =  {https://dl.acm.org/loi/trets},
  abstract =     {Recent success in applying convolutional neural
                 networks (CNNs) to object detection and classification
                 has sparked great interest in accelerating CNNs using
                 hardware-like field-programmable gate arrays (FPGAs).
                 However, finding an efficient FPGA design \ldots{}},
  bibdate =      {Sat Sep 5 18:51:36 MDT 2020},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Sabogal:2020:RFE,
  author =       "Sebastian Sabogal and Alan George and Christopher
                 Wilson",
  title =        "Reconfigurable Framework for Environmentally Adaptive
                 Resilience in Hybrid Space Systems",
  journal =      j-TRETS,
  volume =       "13",
  number =       "3",
  pages =        "14:1--14:32",
  month =        sep,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3398380",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Sep 5 18:51:36 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3398380",
  abstract =     "Due to ongoing innovations in both sensor technology
                 and spacecraft autonomy, onboard space processing
                 continues to be outpaced by the escalating
                 computational demands required for next-generation
                 missions. Commercial-off-the-shelf, hybrid system-on-
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{La:2020:FMS,
  author =       "Tuan Minh La and Kaspar Matas and Nikola Grunchevski
                 and Khoa Dang Pham and Dirk Koch",
  title =        "{FPGADefender}: Malicious Self-oscillator Scanning for
                 {Xilinx UltraScale+} {FPGAs}",
  journal =      j-TRETS,
  volume =       "13",
  number =       "3",
  pages =        "15:1--15:31",
  month =        sep,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3402937",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Sep 5 18:51:36 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3402937",
  abstract =     "Sharing configuration bitstreams rather than netlists
                 is a very desirable feature to protect IP or to share
                 IP without longer CAD tool processing times.
                 Furthermore, an increasing number of systems could
                 hugely benefit from serving multiple users on the
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Tang:2020:PSM,
  author =       {Qi Tang and Zhe Wang and Biao Guo and Li-Hua Zhu and
                 Ji-Bo Wei},
  title =        {Partitioning and Scheduling with Module Merging on
                 Dynamic Partial Reconfigurable {FPGAs}},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {13},
  number =       {3},
  articleno =    {16},
  pages =        {16:1--16:24},
  month =        sep,
  year =         {2020},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  DOI =          {https://doi.org/10.1145/3403702},
  URL =          {https://dl.acm.org/doi/10.1145/3403702},
  journal-URL =  {https://dl.acm.org/loi/trets},
  abstract =     {Field programmable gate array (FPGA) is ubiquitous
                 nowadays and is applied to many areas. Dynamic partial
                 reconfiguration (DPR) is introduced to most modern
                 FPGAs, enabling changing the function of a part of the
                 FPGA by dynamically loading new \ldots{}},
  bibdate =      {Sat Sep 5 18:51:36 MDT 2020},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Dehon:2020:ISS,
  author =       "Andr{\'e} DeHon",
  title =        "Introduction to Special Section on {FCCM 2019}",
  journal =      j-TRETS,
  volume =       "13",
  number =       "4",
  pages =        "17:1--17:2",
  month =        oct,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3410373",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Oct 2 07:58:13 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3410373",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "17",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Zhou:2020:AFR,
  author =       {Yun Zhou and Dries Vercruyce and Dirk Stroobandt},
  title =        {Accelerating {FPGA} Routing Through Algorithmic
                 Enhancements and Connection-aware Parallelization},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {13},
  number =       {4},
  articleno =    {18},
  pages =        {18:1--18:26},
  month =        oct,
  year =         {2020},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  DOI =          {https://doi.org/10.1145/3406959},
  URL =          {https://dl.acm.org/doi/10.1145/3406959},
  journal-URL =  {https://dl.acm.org/loi/trets},
  abstract =     {Routing is a crucial step in Field Programmable Gate
                 Array (FPGA) physical design, as it determines the
                 routes of signals in the circuit, which impacts the
                 design implementation quality significantly. It can be
                 very time-consuming to successfully route \ldots{}},
  bibdate =      {Fri Oct 2 07:58:13 MDT 2020},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Zhang:2020:MRB,
  author =       {Jialiang Zhang and Yue Zha and Nicholas Beckwith and
                 Bangya Liu and Jing Li},
  title =        {{MEG}: a {RISCV}-based System Emulation Infrastructure
                 for Near-data Processing Using {FPGAs} and
                 High-bandwidth Memory},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {13},
  number =       {4},
  articleno =    {19},
  pages =        {19:1--19:24},
  month =        oct,
  year =         {2020},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  DOI =          {https://doi.org/10.1145/3409114},
  URL =          {https://dl.acm.org/doi/10.1145/3409114},
  journal-URL =  {https://dl.acm.org/loi/trets},
  abstract =     {Emerging three-dimensional (3D) memory technologies,
                 such as the Hybrid Memory Cube (HMC) and High Bandwidth
                 Memory (HBM), provide high-bandwidth and massive
                 memory-level parallelism. With the growing
                 heterogeneity and complexity of computer systems
                 \ldots{}},
  bibdate =      {Fri Oct 2 07:58:13 MDT 2020},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Vaishnav:2020:FMF,
  author =       {Anuj Vaishnav and Khoa Dang Pham and Joseph Powell and
                 Dirk Koch},
  title =        {{FOS}: a Modular {FPGA} Operating System for Dynamic
                 Workloads},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {13},
  number =       {4},
  articleno =    {20},
  pages =        {20:1--20:28},
  month =        oct,
  year =         {2020},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  DOI =          {https://doi.org/10.1145/3405794},
  URL =          {https://dl.acm.org/doi/10.1145/3405794},
  journal-URL =  {https://dl.acm.org/loi/trets},
  abstract =     {With FPGAs now being deployed in the cloud and at the
                 edge, there is a need for scalable design methods that
                 can incorporate the heterogeneity present in the
                 hardware and software components of FPGA systems.
                 Moreover, these FPGA systems need to be \ldots{}},
  bibdate =      {Fri Oct 2 07:58:13 MDT 2020},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Ioannou:2020:UNA,
  author =       {Aggelos D. Ioannou and Konstantinos Georgopoulos and
                 Pavlos Malakonakis and Dionisios N. Pnevmatikatos and
                 Vassilis D. Papaefstathiou and Ioannis Papaefstathiou
                 and Iakovos Mavroidis},
  title =        {{UNILOGIC}: a Novel Architecture for Highly Parallel
                 Reconfigurable Systems},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {13},
  number =       {4},
  articleno =    {21},
  pages =        {21:1--21:32},
  month =        oct,
  year =         {2020},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  DOI =          {https://doi.org/10.1145/3409115},
  URL =          {https://dl.acm.org/doi/10.1145/3409115},
  journal-URL =  {https://dl.acm.org/loi/trets},
  abstract =     {One of the main characteristics of High-performance
                 Computing (HPC) applications is that they become
                 increasingly performance and power demanding, pushing
                 HPC systems to their limits. Existing HPC systems have
                 not yet reached exascale performance mainly \ldots{}},
  bibdate =      {Fri Oct 2 07:58:13 MDT 2020},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Zhang:2021:CHP,
  author =       {Xuzhi Zhang and Xiaozhe Shao and George Provelengios
                 and Naveen Kumar Dumpala and Lixin Gao and Russell
                 Tessier},
  title =        {{CoNFV}: a Heterogeneous Platform for Scalable Network
                 Function Virtualization},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {14},
  number =       {1},
  articleno =    {1},
  pages =        {1:1--1:29},
  month =        jan,
  year =         {2021},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  DOI =          {https://doi.org/10.1145/3409113},
  URL =          {https://dl.acm.org/doi/10.1145/3409113},
  journal-URL =  {https://dl.acm.org/loi/trets},
  abstract =     {Network function virtualization (NFV) is a powerful
                 networking approach that leverages computing resources
                 to perform a time-varying set of network processing
                 functions. Although microprocessors can be used for
                 this purpose, their performance limitations and lack of
                 specialization present implementation challenges. In
                 this article, we describe a new heterogeneous
                 hardware-software NFV platform called CoNFV that
                 provides scalability and programmability while
                 supporting significant hardware-level parallelism and
                 reconfiguration. Our computing platform takes advantage
                 of both field-programmable gate arrays (FPGAs) and
                 microprocessors to implement numerous virtual network
                 functions (VNF) that can be dynamically customized to
                 specific network flow needs. The most distinctive
                 feature of our system is the use of global network
                 state to coordinate NFV operations. Traffic management
                 and hardware reconfiguration functions are performed by
                 a global coordinator that allows for the rapid sharing
                 of network function states and continuous evaluation of
                 network function needs. With the help of state sharing
                 mechanism offered by the coordinator, customer-defined
                 VNF instances can be easily migrated between
                 heterogeneous middleboxes as the network environment
                 changes. A resource allocation and scheduling algorithm
                 dynamically assesses resource deployments as network
                 flows and conditions are updated. We show that our
                 deployment algorithm can successfully reallocate FPGA
                 and microprocessor resources in a fraction of a second
                 in response to changes in network flow capacity and
                 network security threats including intrusion.},
  bibdate =      {Fri Jul 16 07:17:04 MDT 2021},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib},
  acknowledgement = ack-nhfb,
}

@Article{Beasley:2021:OCH,
  author          = {Alexander E. Beasley and C. T. Clarke and R. J.
                     Watson},
  title           = {An {OpenGL} Compliant Hardware Implementation of a
                     Graphic Processing Unit Using Field Programmable Gate
                     Array-System on Chip Technology},
  journal         = j-TRETS,
  volume          = {14},
  number          = {1},
  pages           = {2:1--2:24},
  month           = jan,
  year            = {2021},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3410357},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Fri Jul 16 07:17:04 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3410357},
  abstract        = {FPGA-SoC technology provides a heterogeneous platform
                     for advanced, high-performance systems. The System on
                     Chip (SoC) architecture combines traditional single and
                     multiple core processor topologies with flexible FPGA
                     fabric. Dynamic reconfiguration \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {2},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Kara:2021:PGC,
  author          = {Kaan Kara and Gustavo Alonso},
  title           = {{PipeArch}: Generic and Context-Switch Capable Data
                     Processing on {FPGAs}},
  journal         = j-TRETS,
  volume          = {14},
  number          = {1},
  pages           = {3:1--3:28},
  month           = jan,
  year            = {2021},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3418465},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Fri Jul 16 07:17:04 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3418465},
  abstract        = {Data processing systems based on FPGAs offer high
                     performance and energy efficiency for a variety of
                     applications. However, these advantages are achieved
                     through highly specialized designs. The high degree of
                     specialization leads to accelerators with \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {3},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Mohajer:2021:PUC,
  author          = {Soheil Mohajer and Zhiheng Wang and Kia Bazargan and
                     Yuyang Li},
  title           = {Parallel Unary Computing Based on Function
                     Derivatives},
  journal         = j-TRETS,
  volume          = {14},
  number          = {1},
  pages           = {4:1--4:25},
  month           = jan,
  year            = {2021},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3418464},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Fri Jul 16 07:17:04 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3418464},
  abstract        = {The binary number representation has dominated digital
                     logic for decades due to its compact storage
                     requirements. An alternative representation is the
                     unary number system: We use N bits, from which the
                     first M are 1 and the rest are 0 to represent the
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {4},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Kyparissas:2021:LSC,
  author          = {Nikolaos Kyparissas and Apostolos Dollas},
  title           = {Large-scale Cellular Automata on {FPGAs}: a New
                     Generic Architecture and a Framework},
  journal         = j-TRETS,
  volume          = {14},
  number          = {1},
  pages           = {5:1--5:32},
  month           = jan,
  year            = {2021},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3423185},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Fri Jul 16 07:17:04 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3423185},
  abstract        = {Cellular automata (CA) are discrete mathematical
                     models discovered in the 1940s by John von Neumann and
                     Stanislaw Ulam and have been used extensively in many
                     scientific disciplines ever since. The present work
                     evolved from a Field Programmable Gate \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {5},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Peetermans:2021:DAC,
  author          = {Adriaan Peetermans and Vladimir Rozi{\'c} and Ingrid
                     Verbauwhede},
  title           = {Design and Analysis of Configurable Ring Oscillators
                     for True Random Number Generation Based on Coherent
                     Sampling},
  journal         = j-TRETS,
  volume          = {14},
  number          = {2},
  pages           = {7:1--7:20},
  month           = jul,
  year            = {2021},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3433166},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Fri Jul 16 07:17:05 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/prng.bib;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3433166},
  abstract        = {True Random Number Generators (TRNGs) are
                     indispensable in modern cryptosystems. Unfortunately,
                     to guarantee high entropy of the generated numbers,
                     many TRNG designs require a complex implementation
                     procedure, often involving manual placement and
                     routing. In this work, we introduce, analyse, and
                     compare three dynamic calibration mechanisms for the
                     COherent Sampling ring Oscillator based TRNG: GateVar,
                     WireVar, and LUTVar, enabling easy integration of the
                     entropy source into complex systems. The TRNG setup
                     procedure automatically selects a configuration that
                     guarantees the security requirements. In the
                     experiments, we show that two out of the three proposed
                     mechanisms are capable of assuring correct TRNG
                     operation even when an automatic placement is carried
                     out and when the design is ported to another
                     Field-Programmable Gate Array (FPGA) family. We
                     generated random bits on both a Xilinx Spartan 7 and a
                     Microsemi SmartFusion2 implementation that, without
                     post processing, passed the AIS-31 statistical tests at
                     a throughput of 4.65 Mbit/s and 1.47 Mbit/s,
                     respectively.},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {7},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Cho:2021:PMC,
  author          = {Shenghsun Cho and Mrunal Patel and Michael Ferdman and
                     Peter Milder},
  title           = {Practical Model Checking on {FPGAs}},
  journal         = j-TRETS,
  volume          = {14},
  number          = {2},
  pages           = {8:1--8:18},
  month           = jul,
  year            = {2021},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3448272},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Fri Jul 16 07:17:05 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3448272},
  abstract        = {Software verification is an important stage of the
                     software development process, particularly for
                     mission-critical systems. As the traditional
                     methodology of using unit tests falls short of
                     verifying complex software, developers are increasingly
                     relying \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {8},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Ma:2021:SFP,
  author          = {Rui Ma and Jia-Ching Hsu and Tian Tan and Eriko
                     Nurvitadhi and David Sheffield and Rob Pelt and Martin
                     Langhammer and Jaewoong Sim and Aravind Dasu and Derek
                     Chiou},
  title           = {Specializing {FGPU} for Persistent Deep Learning},
  journal         = j-TRETS,
  volume          = {14},
  number          = {2},
  pages           = {10:1--10:23},
  month           = jul,
  year            = {2021},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3457886},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Fri Jul 16 07:17:05 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3457886},
  abstract        = {Overlay architectures are a good way to enable fast
                     development and debug on FPGAs at the expense of
                     potentially limited performance compared to fully
                     customized FPGA designs. When used in concert with
                     hand-tuned FPGA solutions, performant overlay
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {10},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Zhou:2021:SHC,
  author =       "Zhen Zhou and Debiao He and Zhe Liu and Min Luo and
                 Kim-Kwang Raymond Choo",
  title =        "A Software\slash Hardware Co-Design of
                 {Crystals-Dilithium} Signature Scheme",
  journal =      j-TRETS,
  volume =       "14",
  number =       "2",
  pages =        "11:1--11:21",
  month =        jul,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3447812",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Jul 16 07:17:05 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3447812",
  abstract =     "As quantum computers become more affordable and
                 commonplace, existing security systems that are based
                 on classical cryptographic primitives, such as RSA and
                 Elliptic Curve Cryptography (ECC), will no longer be
                 secure. Hence, there has been interest in designing
                 post-quantum cryptographic (PQC) schemes, such as those
                 based on lattice-based cryptography (LBC). The
                 potential of LBC schemes is evidenced by the number of
                 such schemes passing the selection of NIST PQC
                 Standardization Process Round-3. One such scheme is the
                 Crystals-Dilithium signature scheme, which is based on
                 the hard module-lattice problem. However, there is no
                 efficient implementation of the Crystals-Dilithium
                 signature scheme. Hence, in this article, we present a
                 compact hardware architecture containing elaborate
                 modular multiplication units using the Karatsuba
                 algorithm along with smart generators of address
                 sequence and twiddle factors for NTT, which can
                 complete polynomial addition/multiplication with the
                 parameter setting of Dilithium in a short clock period.
                 Also, we propose a fast software/hardware co-design
                 implementation on Field Programmable Gate Array (FPGA)
                 for the Dilithium scheme with a tradeoff between speed
                 and resource utilization. Our co-design implementation
                 outperforms a pure C implementation on a Nios-II
                 processor of the platform Altera DE2-115, in the sense
                 that our implementation is 11.2 and 7.4 times faster
                 for signature and verification, respectively. In
                 addition, we also achieve approximately 51\% and 31\%
                 speed improvement for signature and verification, in
                 comparison to the pure C implementation on processor
                 ARM Cortex-A9 of ZYNQ-7020 platform.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Yasudo:2021:APE,
  author          = {Ryota Yasudo and Jos{\'e} G. F. Coutinho and Ana-Lucia
                     Varbanescu and Wayne Luk and Hideharu Amano and Tobias
                     Becker and Ce Guo},
  title           = {Analytical Performance Estimation for Large-Scale
                     Reconfigurable Dataflow Platforms},
  journal         = j-TRETS,
  volume          = {14},
  number          = {3},
  pages           = {12:1--12:21},
  month           = sep,
  year            = {2021},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3452742},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Aug 21 07:50:22 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3452742},
  abstract        = {Next-generation high-performance computing platforms
                     will handle extreme data- and compute-intensive
                     problems that are intractable with today's technology.
                     A promising path in achieving the next leap in
                     high-performance computing is to embrace \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {12},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Taka:2021:PVA,
  author          = {Endri Taka and Konstantinos Maragos and George
                     Lentaris and Dimitrios Soudris},
  title           = {Process Variability Analysis in Interconnect, Logic,
                     and Arithmetic Blocks of 16-nm {FinFET FPGAs}},
  journal         = j-TRETS,
  volume          = {14},
  number          = {3},
  pages           = {13:1--13:30},
  month           = sep,
  year            = {2021},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3458843},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Aug 21 07:50:22 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3458843},
  abstract        = {In the current work, we study the process variability
                     of logic, interconnect, and arithmetic/DSP resources in
                     commercial 16-nm FPGAs. We create multiple, soft-macro
                     sensors for each distinct resource under evaluation,
                     and we deploy them across the FPGA \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {13},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Sasongko:2021:HCS,
  author          = {Arif Sasongko and I. M. Narendra Kumara and Arief
                     Wicaksana and Fr{\'e}d{\'e}ric Rousseau and Olivier
                     Muller},
  title           = {Hardware Context Switch-based Cryptographic
                     Accelerator for Handling Multiple Streams},
  journal         = j-TRETS,
  volume          = {14},
  number          = {3},
  pages           = {14:1--14:25},
  month           = sep,
  year            = {2021},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3460941},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Aug 21 07:50:22 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3460941},
  abstract        = {The confidentiality and integrity of a stream has
                     become one of the biggest issues in telecommunication.
                     The best available algorithm handling the
                     confidentiality of a data stream is the symmetric key
                     block cipher combined with a chaining mode of
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {14},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Reggiani:2021:ESM,
  author          = {Enrico Reggiani and Emanuele {Del Sozzo} and Davide
                     Conficconi and Giuseppe Natale and Carlo Moroni and
                     Marco D. Santambrogio},
  title           = {Enhancing the Scalability of Multi-{FPGA} Stencil
                     Computations via Highly Optimized {HDL} Components},
  journal         = j-TRETS,
  volume          = {14},
  number          = {3},
  pages           = {15:1--15:33},
  month           = sep,
  year            = {2021},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3461478},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Aug 21 07:50:22 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3461478},
  abstract        = {Stencil-based algorithms are a relevant class of
                     computational kernels in high-performance systems, as
                     they appear in a plethora of fields, from image
                     processing to seismic simulations, from numerical
                     methods to physical modeling. Among the various
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {15},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Al-Hyari:2021:DLF,
  author          = {Abeer Al-Hyari and Hannah Szentimrey and Ahmed Shamli
                     and Timothy Martin and Gary Gr{\'e}wal and Shawki
                     Areibi},
  title           = {A Deep Learning Framework to Predict Routability for
                     {FPGA} Circuit Placement},
  journal         = j-TRETS,
  volume          = {14},
  number          = {3},
  pages           = {16:1--16:28},
  month           = sep,
  year            = {2021},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3465373},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Sat Aug 21 07:50:22 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3465373},
  abstract        = {The ability to accurately and efficiently estimate the
                     routability of a circuit based on its placement is one
                     of the most challenging and difficult tasks in the
                     Field Programmable Gate Array (FPGA) flow. In this
                     article, we present a novel, deep learning \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {16},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Lai:2021:PSS,
  author          = {Yi-Hsiang Lai and Ecenur Ustun and Shaojie Xiang and
                     Zhenman Fang and Hongbo Rong and Zhiru Zhang},
  title           = {Programming and Synthesis for Software-defined {FPGA}
                     Acceleration: Status and Future Prospects},
  journal         = j-TRETS,
  volume          = {14},
  number          = {4},
  pages           = {17:1--17:39},
  month           = dec,
  year            = {2021},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3469660},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Sep 21 07:21:30 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3469660},
  abstract        = {FPGA-based accelerators are increasingly popular
                     across a broad range of applications, because they
                     offer massive parallelism, high energy efficiency, and
                     great flexibility for customizations. However,
                     difficulties in programming and integrating FPGAs
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {17},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Yang:2021:BWB,
  author          = {Tao Yang and Zhezhi He and Tengchuan Kou and Qingzheng
                     Li and Qi Han and Haibao Yu and Fangxin Liu and Yun
                     Liang and Li Jiang},
  title           = {{BISWSRBS}: a {Winograd}-based {CNN} Accelerator with
                     a Fine-grained Regular Sparsity Pattern and Mixed
                     Precision Quantization},
  journal         = j-TRETS,
  volume          = {14},
  number          = {4},
  pages           = {18:1--18:28},
  month           = dec,
  year            = {2021},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3467476},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Sep 21 07:21:30 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3467476},
  abstract        = {Field-programmable Gate Array (FPGA) is a
                     high-performance computing platform for Convolution
                     Neural Networks (CNNs) inference. Winograd algorithm,
                     weight pruning, and quantization are widely adopted to
                     reduce the storage and arithmetic overhead of CNNs
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {18},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Wijtvliet:2021:CER,
  author =       "Mark Wijtvliet and Henk Corporaal and Akash Kumar",
  title =        "{CGRA-EAM} --- Rapid Energy and Area Estimation for
                 Coarse-grained Reconfigurable Architectures",
  journal =      j-TRETS,
  volume =       "14",
  number =       "4",
  pages =        "19:1--19:28",
  month =        dec,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3468874",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Sep 21 07:21:30 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3468874",
  abstract =     "Reconfigurable architectures are quickly gaining in
                 popularity due to their flexibility and ability to
                 provide high energy efficiency. However, reconfigurable
                 systems allow for a huge design space. Iterative design
                 space exploration (DSE) is often \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Gu:2021:DGB,
  author          = {Zhenghua Gu and Wenqing Wan and Jundong Xie and Chang
                     Wu},
  title           = {Dependency Graph-based High-level Synthesis for
                     Maximum Instruction Parallelism},
  journal         = j-TRETS,
  volume          = {14},
  number          = {4},
  pages           = {20:1--20:15},
  month           = dec,
  year            = {2021},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3468875},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Sep 21 07:21:30 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3468875},
  abstract        = {Performance optimization is an important goal for
                     High-level Synthesis (HLS). Existing HLS scheduling
                     algorithms are all based on Control and Data Flow Graph
                     (CDFG) and will schedule basic blocks in sequential
                     order. Our study shows that the sequential \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {20},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Hung:2021:AGF,
  author          = {Jos{\'e} Romero Hung and Chao Li and Pengyu Wang and
                     Chuanming Shao and Jinyang Guo and Jing Wang and
                     Guoyong Shi},
  title           = {{ACE-GCN}: a Fast Data-driven {FPGA} Accelerator for
                     {GCN} Embedding},
  journal         = j-TRETS,
  volume          = {14},
  number          = {4},
  pages           = {21:1--21:23},
  month           = dec,
  year            = {2021},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3470536},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Sep 21 07:21:30 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3470536},
  abstract        = {ACE-GCN is a fast and resource/energy-efficient FPGA
                     accelerator for graph convolutional embedding under
                     data-driven and in-place processing conditions. Our
                     accelerator exploits the inherent power law
                     distribution and high sparsity commonly exhibited by
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {21},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Sabogal:2021:RFR,
  author          = {Sebastian Sabogal and Alan George and Gary Crum},
  title           = {Reconfigurable Framework for Resilient Semantic
                     Segmentation for Space Applications},
  journal         = j-TRETS,
  volume          = {14},
  number          = {4},
  pages           = {22:1--22:32},
  month           = dec,
  year            = {2021},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3472770},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Sep 21 07:21:30 MDT 2021},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3472770},
  abstract        = {Deep learning (DL) presents new opportunities for
                     enabling spacecraft autonomy, onboard analysis, and
                     intelligent applications for space missions. However,
                     DL applications are computationally intensive and often
                     infeasible to deploy on radiation-hardened \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {22},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Shannon:2022:ISS,
  author          = {Lesley Shannon},
  title           = {Introduction to Special Section on {FPGA 2020}},
  journal         = j-TRETS,
  volume          = {15},
  number          = {1},
  pages           = {1:1--1:2},
  month           = mar,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3485586},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Fri Jan 28 07:03:50 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3485586},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {1},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Rybalkin:2022:WMG,
  author          = {Vladimir Rybalkin and Jonas Ney and Menbere Kina
                     Tekleyohannes and Norbert Wehn},
  title           = {When Massive {GPU} Parallelism Ain't Enough: a Novel
                     Hardware Architecture of {$2$D-LSTM} Neural Network},
  journal         = j-TRETS,
  volume          = {15},
  number          = {1},
  pages           = {2:1--2:35},
  month           = mar,
  year            = {2022},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3469661},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Fri Jan 28 07:03:50 MST 2022},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3469661},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {2},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@article{Papaphilippou:2022:HHP,
  author       = {Philippos Papaphilippou and Jiuxi Meng and Nadeen
                 Gebara and Wayne Luk},
  title        = {{Hipernetch}: High-Performance {FPGA} Network Switch},
  journal      = j-TRETS,
  fjournal     = {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal     = {ACM Trans. Reconfigurable Technol. Syst.},
  volume       = {15},
  number       = {1},
  pages        = {3:1--3:31},
  articleno    = {3},
  month        = mar,
  year         = {2022},
  coden        = {????},
  issn         = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l       = {1936-7406},
  doi          = {https://doi.org/10.1145/3477054},
  url          = {https://dl.acm.org/doi/10.1145/3477054},
  journal-url  = {https://dl.acm.org/loi/trets},
  abstract     = {We present Hipernetch, a novel FPGA-based design for
                 performing high-bandwidth network switching. FPGAs have
                 recently become more popular in data centers due to
                 their promising capabilities for a wide range of
                 applications. With the recent surge in \ldots{}},
  acknowledgement = ack-nhfb,
  bibdate      = {Fri Jan 28 07:03:50 MST 2022},
  bibsource    = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
}

@article{Josipovic:2022:BPS,
  author       = {Lana Josipovi{\'c} and Shabnam Sheikhha and Andrea
                 Guerrieri and Paolo Ienne and Jordi Cortadella},
  title        = {Buffer Placement and Sizing for High-Performance
                 Dataflow Circuits},
  journal      = j-TRETS,
  fjournal     = {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal     = {ACM Trans. Reconfigurable Technol. Syst.},
  volume       = {15},
  number       = {1},
  pages        = {4:1--4:32},
  articleno    = {4},
  month        = mar,
  year         = {2022},
  coden        = {????},
  issn         = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l       = {1936-7406},
  doi          = {https://doi.org/10.1145/3477053},
  url          = {https://dl.acm.org/doi/10.1145/3477053},
  journal-url  = {https://dl.acm.org/loi/trets},
  abstract     = {Commercial high-level synthesis tools typically
                 produce statically scheduled circuits. Yet, effective
                 C-to-circuit conversion of arbitrary software
                 applications calls for dataflow circuits, as they can
                 handle efficiently variable latencies (e.g., caches),
                 \ldots{}},
  acknowledgement = ack-nhfb,
  bibdate      = {Fri Jan 28 07:03:50 MST 2022},
  bibsource    = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
}

@article{Gross:2022:ESF,
  author       = {Mathieu Gross and Konrad Hohentanner and Stefan
                 Wiehler and Georg Sigl},
  title        = {Enhancing the Security of {FPGA-SoCs} via the Usage of
                 {ARM TrustZone} and a Hybrid-{TPM}},
  journal      = j-TRETS,
  fjournal     = {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal     = {ACM Trans. Reconfigurable Technol. Syst.},
  volume       = {15},
  number       = {1},
  pages        = {5:1--5:26},
  articleno    = {5},
  month        = mar,
  year         = {2022},
  coden        = {????},
  issn         = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l       = {1936-7406},
  doi          = {https://doi.org/10.1145/3472959},
  url          = {https://dl.acm.org/doi/10.1145/3472959},
  journal-url  = {https://dl.acm.org/loi/trets},
  abstract     = {Isolated execution is a concept commonly used for
                 increasing the security of a computer system. In the
                 embedded world, ARM TrustZone technology enables this
                 goal and is currently used on mobile devices for
                 applications such as secure payment or biometric
                 \ldots{}},
  acknowledgement = ack-nhfb,
  bibdate      = {Fri Jan 28 07:03:50 MST 2022},
  bibsource    = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
}

@article{Wu:2022:LPF,
  author       = {Chen Wu and Mingyu Wang and Xinyuan Chu and Kun Wang
                 and Lei He},
  title        = {Low-precision Floating-point Arithmetic for
                 High-performance {FPGA}-based {CNN} Acceleration},
  journal      = j-TRETS,
  fjournal     = {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal     = {ACM Trans. Reconfigurable Technol. Syst.},
  volume       = {15},
  number       = {1},
  pages        = {6:1--6:21},
  articleno    = {6},
  month        = mar,
  year         = {2022},
  coden        = {????},
  issn         = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l       = {1936-7406},
  doi          = {https://doi.org/10.1145/3474597},
  url          = {https://dl.acm.org/doi/10.1145/3474597},
  journal-url  = {https://dl.acm.org/loi/trets},
  acknowledgement = ack-nhfb,
  bibdate      = {Fri Jan 28 07:03:50 MST 2022},
  bibsource    = {https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
}

@article{Chen:2022:NTE,
  author       = {Deming Chen},
  title        = {Note from the {TRETS EiC} about the new Journal-first
                 track in {FPT'21}},
  journal      = j-TRETS,
  fjournal     = {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal     = {ACM Trans. Reconfigurable Technol. Syst.},
  volume       = {15},
  number       = {1},
  pages        = {7e:1--7e:1},
  articleno    = {7e},
  month        = mar,
  year         = {2022},
  coden        = {????},
  issn         = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l       = {1936-7406},
  doi          = {https://doi.org/10.1145/3501280},
  url          = {https://dl.acm.org/doi/10.1145/3501280},
  journal-url  = {https://dl.acm.org/loi/trets},
  acknowledgement = ack-nhfb,
  bibdate      = {Fri Jan 28 07:03:50 MST 2022},
  bibsource    = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
}

@article{Streit:2022:DET,
  author       = {Franz-Josef Streit and Paul Kr{\"u}ger and Andreas
                 Becher and Stefan Wildermann and J{\"u}rgen Teich},
  title        = {Design and Evaluation of a Tunable {PUF} Architecture
                 for {FPGAs}},
  journal      = j-TRETS,
  fjournal     = {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal     = {ACM Trans. Reconfigurable Technol. Syst.},
  volume       = {15},
  number       = {1},
  pages        = {7:1--7:27},
  articleno    = {7},
  month        = mar,
  year         = {2022},
  coden        = {????},
  issn         = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l       = {1936-7406},
  doi          = {https://doi.org/10.1145/3491237},
  url          = {https://dl.acm.org/doi/10.1145/3491237},
  journal-url  = {https://dl.acm.org/loi/trets},
  abstract     = {FPGA-based Physical Unclonable Functions (PUF) have
                 emerged as a viable alternative to permanent key
                 storage by turning effects of inaccuracies during the
                 manufacturing process of a chip into a unique,
                 FPGA-intrinsic secret. However, many fixed PUF
                 \ldots{}},
  acknowledgement = ack-nhfb,
  bibdate      = {Fri Jan 28 07:03:50 MST 2022},
  bibsource    = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
}

@article{Zhou:2022:ROS,
  author       = {Yun Zhou and Pongstorn Maidee and Chris Lavin and
                 Alireza Kaviani and Dirk Stroobandt},
  title        = {{RWRoute}: an Open-source Timing-driven Router for
                 Commercial {FPGAs}},
  journal      = j-TRETS,
  fjournal     = {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal     = {ACM Trans. Reconfigurable Technol. Syst.},
  volume       = {15},
  number       = {1},
  pages        = {8:1--8:27},
  articleno    = {8},
  month        = mar,
  year         = {2022},
  coden        = {????},
  issn         = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l       = {1936-7406},
  doi          = {https://doi.org/10.1145/3491236},
  url          = {https://dl.acm.org/doi/10.1145/3491236},
  journal-url  = {https://dl.acm.org/loi/trets},
  abstract     = {One of the key obstacles to pervasive deployment of
                 FPGA accelerators in data centers is their cumbersome
                 programming model. Open source tooling is suggested as
                 a way to develop alternative EDA tools to remedy this
                 issue. Open source FPGA CAD tools have \ldots{}},
  acknowledgement = ack-nhfb,
  bibdate      = {Fri Jan 28 07:03:50 MST 2022},
  bibsource    = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
}

@article{Rasoulinezhad:2022:REB,
  author       = {Seyedramin Rasoulinezhad and Esther Roorda and Steve
                 Wilton and Philip H. W. Leong and David Boland},
  title        = {Rethinking Embedded Blocks for Machine Learning
                 Applications},
  journal      = j-TRETS,
  fjournal     = {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal     = {ACM Trans. Reconfigurable Technol. Syst.},
  volume       = {15},
  number       = {1},
  pages        = {9:1--9:30},
  articleno    = {9},
  month        = mar,
  year         = {2022},
  coden        = {????},
  issn         = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l       = {1936-7406},
  doi          = {https://doi.org/10.1145/3491234},
  url          = {https://dl.acm.org/doi/10.1145/3491234},
  journal-url  = {https://dl.acm.org/loi/trets},
  abstract     = {The underlying goal of FPGA architecture research is
                 to devise flexible substrates that implement a wide
                 variety of circuits efficiently. Contemporary FPGA
                 architectures have been optimized to support
                 networking, signal processing, and image processing
                 \ldots{}},
  acknowledgement = ack-nhfb,
  bibdate      = {Fri Jan 28 07:03:50 MST 2022},
  bibsource    = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
}

@article{Menzel:2022:SSA,
  author       = {Johannes Menzel and Christian Plessl and Tobias
                 Kenter},
  title        = {The Strong Scaling Advantage of {FPGAs} in {HPC} for
                 {$N$}-body Simulations},
  journal      = j-TRETS,
  fjournal     = {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal     = {ACM Trans. Reconfigurable Technol. Syst.},
  volume       = {15},
  number       = {1},
  pages        = {10:1--10:30},
  articleno    = {10},
  month        = mar,
  year         = {2022},
  coden        = {????},
  issn         = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l       = {1936-7406},
  doi          = {https://doi.org/10.1145/3491235},
  url          = {https://dl.acm.org/doi/10.1145/3491235},
  journal-url  = {https://dl.acm.org/loi/trets},
  abstract     = {N-body methods are one of the essential algorithmic
                 building blocks of high-performance and parallel
                 computing. Previous research has shown promising
                 performance for implementing n-body simulations with
                 pairwise force calculations on FPGAs. However, to
                 \ldots{}},
  acknowledgement = ack-nhfb,
  bibdate      = {Fri Jan 28 07:03:50 MST 2022},
  bibsource    = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
}

@article{Eguro:2022:ISIa,
  author       = {Ken Eguro and Stephen Neuendorffer and Viktor Prasanna
                 and Hongbo Rong},
  title        = {Introduction to Special Issue on {FPGAs} in Data
                 Centers},
  journal      = j-TRETS,
  fjournal     = {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal     = {ACM Trans. Reconfigurable Technol. Syst.},
  volume       = {15},
  number       = {2},
  pages        = {11:1--11:2},
  articleno    = {11},
  month        = jun,
  year         = {2022},
  coden        = {????},
  issn         = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l       = {1936-7406},
  doi          = {https://doi.org/10.1145/3493607},
  url          = {https://dl.acm.org/doi/10.1145/3493607},
  journal-url  = {https://dl.acm.org/loi/trets},
  acknowledgement = ack-nhfb,
  bibdate      = {Wed Mar 2 08:59:34 MST 2022},
  bibsource    = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
}

@article{Keller:2022:ITR,
  author       = {Andrew M. Keller and Michael J. Wirthlin},
  title        = {The Impact of Terrestrial Radiation on {FPGAs} in Data
                 Centers},
  journal      = j-TRETS,
  fjournal     = {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal     = {ACM Trans. Reconfigurable Technol. Syst.},
  volume       = {15},
  number       = {2},
  pages        = {12:1--12:21},
  articleno    = {12},
  month        = jun,
  year         = {2022},
  coden        = {????},
  issn         = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l       = {1936-7406},
  doi          = {https://doi.org/10.1145/3457198},
  url          = {https://dl.acm.org/doi/10.1145/3457198},
  journal-url  = {https://dl.acm.org/loi/trets},
  abstract     = {Field programmable gate arrays (FPGAs) are used in
                 large numbers in data centers around the world. They
                 are used for cloud computing and computer networking.
                 The most common type of FPGA used in data centers are
                 re-programmable SRAM-based FPGAs. These \ldots{}},
  acknowledgement = ack-nhfb,
  bibdate      = {Wed Mar 2 08:59:34 MST 2022},
  bibsource    = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
}

@article{Asiatici:2022:RCS,
  author       = {Mikhail Asiatici and Paolo Ienne},
  title        = {Request, Coalesce, Serve, and Forget: Miss-Optimized
                 Memory Systems for Bandwidth-Bound Cache-Unfriendly
                 Applications on {FPGAs}},
  journal      = j-TRETS,
  fjournal     = {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal     = {ACM Trans. Reconfigurable Technol. Syst.},
  volume       = {15},
  number       = {2},
  pages        = {13:1--13:33},
  articleno    = {13},
  month        = jun,
  year         = {2022},
  coden        = {????},
  issn         = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l       = {1936-7406},
  doi          = {https://doi.org/10.1145/3466823},
  url          = {https://dl.acm.org/doi/10.1145/3466823},
  journal-url  = {https://dl.acm.org/loi/trets},
  abstract     = {Applications such as large-scale sparse linear algebra
                 and graph analytics are challenging to accelerate on
                 FPGAs due to the short irregular memory accesses,
                 resulting in low cache hit rates. Nonblocking caches
                 reduce the bandwidth required by misses by \ldots{}},
  acknowledgement = ack-nhfb,
  bibdate      = {Wed Mar 2 08:59:34 MST 2022},
  bibsource    = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
}

@article{Dogan:2022:CBB,
  author       = {Atakan Dogan and Kemal Ebcioglu},
  title        = {Cloud Building Block Chip for Creating {FPGA} and
                 {ASIC} Clouds},
  journal      = j-TRETS,
  fjournal     = {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal     = {ACM Trans. Reconfigurable Technol. Syst.},
  volume       = {15},
  number       = {2},
  pages        = {14:1--14:35},
  articleno    = {14},
  month        = jun,
  year         = {2022},
  coden        = {????},
  issn         = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l       = {1936-7406},
  doi          = {https://doi.org/10.1145/3466822},
  url          = {https://dl.acm.org/doi/10.1145/3466822},
  journal-url  = {https://dl.acm.org/loi/trets},
  abstract     = {Hardware-accelerated cloud computing systems based on
                 FPGA chips (FPGA cloud) or ASIC chips (ASIC cloud) have
                 emerged as a new technology trend for power-efficient
                 acceleration of various software applications. However,
                 the operating systems and \ldots{}},
  acknowledgement = ack-nhfb,
  bibdate      = {Wed Mar 2 08:59:34 MST 2022},
  bibsource    = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
}

@article{Alonso:2022:EDS,
  author       = {Tobias Alonso and Lucian Petrica and Mario Ruiz and
                 Jakoba Petri-Koenig and Yaman Umuroglu and Ioannis
                 Stamelos and Elias Koromilas and Michaela Blott and
                 Kees Vissers},
  title        = {{Elastic-DF}: Scaling Performance of {DNN} Inference
                 in {FPGA} Clouds through Automatic Partitioning},
  journal      = j-TRETS,
  fjournal     = {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal     = {ACM Trans. Reconfigurable Technol. Syst.},
  volume       = {15},
  number       = {2},
  pages        = {15:1--15:34},
  articleno    = {15},
  month        = jun,
  year         = {2022},
  coden        = {????},
  issn         = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l       = {1936-7406},
  doi          = {https://doi.org/10.1145/3470567},
  url          = {https://dl.acm.org/doi/10.1145/3470567},
  journal-url  = {https://dl.acm.org/loi/trets},
  abstract     = {Customized compute acceleration in the datacenter is
                 key to the wider roll-out of applications based on deep
                 neural network (DNN) inference. In this article, we
                 investigate how to maximize the performance and
                 scalability of field-programmable gate array \ldots{}},
  acknowledgement = ack-nhfb,
  bibdate      = {Wed Mar 2 08:59:34 MST 2022},
  bibsource    = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
}

@article{Salamat:2022:NGN,
  author       = {Sahand Salamat and Hui Zhang and Yang Seok Ki and
                 Tajana Rosing},
  title        = {\pkg{NASCENT2}: Generic Near-Storage Sort Accelerator
                 for Data Analytics on {SmartSSD}},
  journal      = j-TRETS,
  fjournal     = {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal     = {ACM Trans. Reconfigurable Technol. Syst.},
  volume       = {15},
  number       = {2},
  pages        = {16:1--16:29},
  articleno    = {16},
  month        = jun,
  year         = {2022},
  coden        = {????},
  issn         = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l       = {1936-7406},
  doi          = {https://doi.org/10.1145/3472769},
  url          = {https://dl.acm.org/doi/10.1145/3472769},
  journal-url  = {https://dl.acm.org/loi/trets},
  abstract     = {As the size of data generated every day grows
                 dramatically, the computational bottleneck of computer
                 systems has shifted toward storage devices. The
                 interface between the storage and the computational
                 platforms has become the main limitation due to its
                 \ldots{}},
  acknowledgement = ack-nhfb,
  bibdate      = {Wed Mar 2 08:59:34 MST 2022},
  bibsource    = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
}

@article{Damiani:2022:BFS,
  author       = {Andrea Damiani and Giorgia Fiscaletti and Marco Bacis
                 and Rolando Brondolin and Marco D. Santambrogio},
  title        = {\pkg{BlastFunction}: a Full-stack Framework Bringing
                 {FPGA} Hardware Acceleration to Cloud-native
                 Applications},
  journal      = j-TRETS,
  fjournal     = {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal     = {ACM Trans. Reconfigurable Technol. Syst.},
  volume       = {15},
  number       = {2},
  pages        = {17:1--17:27},
  articleno    = {17},
  month        = jun,
  year         = {2022},
  coden        = {????},
  issn         = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l       = {1936-7406},
  doi          = {https://doi.org/10.1145/3472958},
  url          = {https://dl.acm.org/doi/10.1145/3472958},
  journal-url  = {https://dl.acm.org/loi/trets},
  abstract     = {``Cloud-native'' is the umbrella adjective describing
                 the standard approach for developing applications that
                 exploit cloud infrastructures' scalability and
                 elasticity at their best. As the application complexity
                 and user-bases grow, designing for \ldots{}},
  acknowledgement = ack-nhfb,
  bibdate      = {Wed Mar 2 08:59:34 MST 2022},
  bibsource    = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
}

@article{DAlberto:2022:XID,
  author       = {Paolo D'Alberto and Victor Wu and Aaron Ng and Rahul
                 Nimaiyar and Elliott Delaye and Ashish Sirasao},
  title        = {\pkg{xDNN}: Inference for Deep Convolutional Neural
                 Networks},
  journal      = j-TRETS,
  fjournal     = {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal     = {ACM Trans. Reconfigurable Technol. Syst.},
  volume       = {15},
  number       = {2},
  pages        = {18:1--18:29},
  articleno    = {18},
  month        = jun,
  year         = {2022},
  coden        = {????},
  issn         = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l       = {1936-7406},
  doi          = {https://doi.org/10.1145/3473334},
  url          = {https://dl.acm.org/doi/10.1145/3473334},
  journal-url  = {https://dl.acm.org/loi/trets},
  abstract     = {We present xDNN, an end-to-end system for
                 deep-learning inference based on a family of
                 specialized hardware processors synthesized on
                 Field-Programmable Gate Array (FPGAs) and Convolution
                 Neural Networks (CNN). We present a design optimized
                 for low \ldots{}},
  acknowledgement = ack-nhfb,
  bibdate      = {Wed Mar 2 08:59:34 MST 2022},
  bibsource    = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
}

@article{Mbongue:2022:DMT,
  author       = {Joel Mandebi Mbongue and Danielle Tchuinkou Kwadjo and
                 Alex Shuping and Christophe Bobda},
  title        = {Deploying Multi-tenant {FPGAs} within {Linux}-based
                 Cloud Infrastructure},
  journal      = j-TRETS,
  fjournal     = {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal     = {ACM Trans. Reconfigurable Technol. Syst.},
  volume       = {15},
  number       = {2},
  pages        = {19:1--19:31},
  articleno    = {19},
  month        = jun,
  year         = {2022},
  coden        = {????},
  issn         = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l       = {1936-7406},
  doi          = {https://doi.org/10.1145/3474058},
  url          = {https://dl.acm.org/doi/10.1145/3474058},
  journal-url  = {https://dl.acm.org/loi/trets},
  abstract     = {Cloud deployments now increasingly exploit
                 Field-Programmable Gate Array (FPGA) accelerators as
                 part of virtual instances. While cloud FPGAs are still
                 essentially single-tenant, the growing demand for
                 efficient hardware acceleration paves the way to FPGA
                 \ldots{}},
  acknowledgement = ack-nhfb,
  bibdate      = {Wed Mar 2 08:59:34 MST 2022},
  bibsource    = {https://www.math.utah.edu/pub/tex/bib/linux.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib;
                 https://www.math.utah.edu/pub/tex/bib/unix.bib},
}

@article{Hogervorst:2022:HAH,
  author       = {Tom Hogervorst and Razvan Nane and Giacomo Marchiori
                 and Tong Dong Qiu and Markus Blatt and Alf Birger
                 Rustad},
  title        = {Hardware Acceleration of High-Performance
                 Computational Flow Dynamics Using High-Bandwidth
                 Memory-Enabled Field-Programmable Gate Arrays},
  journal      = j-TRETS,
  fjournal     = {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal     = {ACM Trans. Reconfigurable Technol. Syst.},
  volume       = {15},
  number       = {2},
  pages        = {20:1--20:35},
  articleno    = {20},
  month        = jun,
  year         = {2022},
  coden        = {????},
  issn         = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l       = {1936-7406},
  doi          = {https://doi.org/10.1145/3476229},
  url          = {https://dl.acm.org/doi/10.1145/3476229},
  journal-url  = {https://dl.acm.org/loi/trets},
  abstract     = {Scientific computing is at the core of many
                 High-Performance Computing applications, including
                 computational flow dynamics. Because of the utmost
                 importance to simulate increasingly larger
                 computational models, hardware acceleration is
                 receiving increased \ldots{}},
  acknowledgement = ack-nhfb,
  bibdate      = {Wed Mar 2 08:59:34 MST 2022},
  bibsource    = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
}

@article{Sun:2022:BEC,
  author       = {Gongjin Sun and Seongyoung Kang and Sang-Woo Jun},
  title        = {\pkg{BurstZ+}: Eliminating The Communication
                 Bottleneck of Scientific Computing Accelerators via
                 Accelerated Compression},
  journal      = j-TRETS,
  fjournal     = {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal     = {ACM Trans. Reconfigurable Technol. Syst.},
  volume       = {15},
  number       = {2},
  pages        = {21:1--21:34},
  articleno    = {21},
  month        = jun,
  year         = {2022},
  coden        = {????},
  issn         = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l       = {1936-7406},
  doi          = {https://doi.org/10.1145/3476831},
  url          = {https://dl.acm.org/doi/10.1145/3476831},
  journal-url  = {https://dl.acm.org/loi/trets},
  abstract     = {We present BurstZ+, an accelerator platform that
                 eliminates the communication bottleneck between
                 PCIe-attached scientific computing accelerators and
                 their host servers, via hardware-optimized compression.
                 While accelerators such as GPUs and FPGAs provide
                 \ldots{}},
  acknowledgement = ack-nhfb,
  bibdate      = {Wed Mar 2 08:59:34 MST 2022},
  bibsource    = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
}

@article{Eguro:2022:ISIb,
  author       = {Ken Eguro and Stephen Neuendorffer and Viktor Prasanna
                 and Hongbo Rong},
  title        = {Introduction to Special Issue on {FPGAs} in Data
                 Centers, {Part II}},
  journal      = j-TRETS,
  fjournal     = {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal     = {ACM Trans. Reconfigurable Technol. Syst.},
  volume       = {15},
  number       = {3},
  pages        = {22:1--22:2},
  articleno    = {22},
  month        = sep,
  year         = {2022},
  coden        = {????},
  issn         = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l       = {1936-7406},
  doi          = {https://doi.org/10.1145/3495231},
  url          = {https://dl.acm.org/doi/10.1145/3495231},
  journal-url  = {https://dl.acm.org/loi/trets},
  acknowledgement = ack-nhfb,
  bibdate      = {Tue May 24 07:29:32 MDT 2022},
  bibsource    = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
}

@article{Tarafdar:2022:AOF,
  author       = {Naif Tarafdar and Giuseppe {Di Guglielmo} and Philip
                 C. Harris and Jeffrey D. Krupa and Vladimir Loncar and
                 Dylan S. Rankin and Nhan Tran and Zhenbin Wu and
                 Qianfeng Shen and Paul Chow},
  title        = {{AIgean}: an Open Framework for Deploying Machine
                 Learning on Heterogeneous Clusters},
  journal      = j-TRETS,
  fjournal     = {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal     = {ACM Trans. Reconfigurable Technol. Syst.},
  volume       = {15},
  number       = {3},
  pages        = {23:1--23:32},
  articleno    = {23},
  month        = sep,
  year         = {2022},
  coden        = {????},
  issn         = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l       = {1936-7406},
  doi          = {https://doi.org/10.1145/3482854},
  url          = {https://dl.acm.org/doi/10.1145/3482854},
  journal-url  = {https://dl.acm.org/loi/trets},
  abstract     = {AIgean, pronounced like the sea, is an open framework
                 to build and deploy machine learning (ML) algorithms on
                 a heterogeneous cluster of devices (CPUs and FPGAs). We
                 leverage two open source projects: Galapagos, for
                 multi-FPGA deployment, and hls4ml, for \ldots{}},
  acknowledgement = ack-nhfb,
  bibdate      = {Tue May 24 07:29:32 MDT 2022},
  bibsource    = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
}

@article{Zeng:2022:UFV,
  author       = {Shulin Zeng and Guohao Dai and Hanbo Sun and Jun Liu
                 and Shiyao Li and Guangjun Ge and Kai Zhong and Kaiyuan
                 Guo and Yu Wang and Huazhong Yang},
  title        = {A Unified {FPGA} Virtualization Framework for
                 General-Purpose Deep Neural Networks in the Cloud},
  journal      = j-TRETS,
  fjournal     = {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal     = {ACM Trans. Reconfigurable Technol. Syst.},
  volume       = {15},
  number       = {3},
  pages        = {24:1--24:31},
  articleno    = {24},
  month        = sep,
  year         = {2022},
  coden        = {????},
  issn         = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l       = {1936-7406},
  doi          = {https://doi.org/10.1145/3480170},
  url          = {https://dl.acm.org/doi/10.1145/3480170},
  journal-url  = {https://dl.acm.org/loi/trets},
  abstract     = {INFerence-as-a-Service (INFaaS) has become a primary
                 workload in the cloud. However, existing FPGA-based
                 Deep Neural Network (DNN) accelerators are mainly
                 optimized for the fastest speed of a single task, while
                 the multi-tenancy of INFaaS has not been explored yet.
                 As the demand for INFaaS keeps growing, simply
                 increasing the number of FPGA-based DNN accelerators is
                 not cost-effective, while merely sharing these
                 single-task optimized DNN accelerators in a
                 time-division multiplexing way could lead to poor
                 isolation and high-performance loss for INFaaS. On the
                 other hand, current cloud-based DNN accelerators have
                 excessive compilation overhead, especially when scaling
                 out to multi-FPGA systems for multi-tenant sharing,
                 leading to unacceptable compilation costs for both
                 offline deployment and online reconfiguration.
                 Therefore, it is far from providing efficient and
                 flexible FPGA virtualization for public and private
                 cloud scenarios.\par

                 Aiming to solve these problems, we propose a unified
                 virtualization framework for general-purpose deep
                 neural networks in the cloud, enabling multi-tenant
                 sharing for both the Convolution Neural Network (CNN),
                 and the Recurrent Neural Network (RNN) accelerators on
                 a single FPGA. The isolation is enabled by introducing
                 a two-level instruction dispatch module and a
                 multi-core based hardware resources pool. Such designs
                 provide isolated and runtime-programmable hardware
                 resources, which further leads to performance isolation
                 for multi-tenant sharing. On the other hand, to
                 overcome the heavy re-compilation overheads, a
                 tiling-based instruction frame package design and a
                 two-stage static-dynamic compilation, are proposed.
                 Only the lightweight runtime information is re-compiled
                 with $ \approx $1 ms overhead, thus guaranteeing the
                 private cloud's performance. Finally, the extensive
                 experimental results show that the proposed virtualized
                 solutions achieve up to $ 3.12 \times $ and $ 6.18
                 \times $ higher throughput in the private cloud
                 compared with the static CNN and RNN baseline designs,
                 respectively.},
  acknowledgement = ack-nhfb,
  bibdate      = {Tue May 24 07:29:32 MDT 2022},
  bibsource    = {https://www.math.utah.edu/pub/tex/bib/trets.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib},
}

@article{Alachiotis:2022:SPR,
  author =       {Nikolaos Alachiotis and Panagiotis Skrimponis and
                 Manolis Pissadakis and Dionisios Pnevmatikatos},
  title =        {Scalable Phylogeny Reconstruction with Disaggregated
                 Near-memory Processing},
  journal =      j-TRETS,
  volume =       {15},
  number =       {3},
  pages =        {25:1--25:32},
  articleno =    {25},
  month =        sep,
  year =         {2022},
  doi =          {https://doi.org/10.1145/3484983},
  url =          {https://dl.acm.org/doi/10.1145/3484983},
  issn =         {1936-7406 (print), 1936-7414 (electronic)},
  issn-l =       {1936-7406},
  coden =        {????},
  abstract =     {Disaggregated computer architectures eliminate
                 resource fragmentation in next-generation datacenters
                 by enabling virtual machines to employ resources such
                 as CPUs, memory, and accelerators that are physically
                 located on different servers. While this paves
                 \ldots{}},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-url =  {https://dl.acm.org/loi/trets},
  bibdate =      {Tue May 24 07:29:32 MDT 2022},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Brennsteiner:2022:RTD,
  author =       {Stefan Brennsteiner and Tughrul Arslan and John
                 Thompson and Andrew McCormick},
  title =        {A Real-Time Deep Learning {OFDM} Receiver},
  journal =      j-TRETS,
  volume =       {15},
  number =       {3},
  pages =        {26:1--26:25},
  articleno =    {26},
  month =        sep,
  year =         {2022},
  doi =          {https://doi.org/10.1145/3494049},
  url =          {https://dl.acm.org/doi/10.1145/3494049},
  issn =         {1936-7406 (print), 1936-7414 (electronic)},
  issn-l =       {1936-7406},
  coden =        {????},
  abstract =     {Machine learning in the physical layer of
                 communication systems holds the potential to improve
                 performance and simplify design methodology. Many
                 algorithms have been proposed; however, the model
                 complexity is often unfeasible for real-time
                 deployment. The \ldots{}},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-url =  {https://dl.acm.org/loi/trets},
  bibdate =      {Tue May 24 07:29:32 MDT 2022},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Lienen:2022:DDR,
  author =       {Christian Lienen and Marco Platzner},
  title =        {Design of Distributed Reconfigurable Robotics Systems
                 with {ReconROS}},
  journal =      j-TRETS,
  volume =       {15},
  number =       {3},
  pages =        {27:1--27:20},
  articleno =    {27},
  month =        sep,
  year =         {2022},
  doi =          {https://doi.org/10.1145/3494571},
  url =          {https://dl.acm.org/doi/10.1145/3494571},
  issn =         {1936-7406 (print), 1936-7414 (electronic)},
  issn-l =       {1936-7406},
  coden =        {????},
  abstract =     {Robotics applications process large amounts of data in
                 real time and require compute platforms that provide
                 high performance and energy efficiency. FPGAs are well
                 suited for many of these applications, but there is a
                 reluctance in the robotics community \ldots{}},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-url =  {https://dl.acm.org/loi/trets},
  bibdate =      {Tue May 24 07:29:32 MDT 2022},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Cahill:2022:AFD,
  author =       {Eli Cahill and Brad Hutchings and Jeffrey Goeders},
  title =        {Approaches for {FPGA} Design Assurance},
  journal =      j-TRETS,
  volume =       {15},
  number =       {3},
  pages =        {28:1--28:29},
  articleno =    {28},
  month =        sep,
  year =         {2022},
  doi =          {https://doi.org/10.1145/3491233},
  url =          {https://dl.acm.org/doi/10.1145/3491233},
  issn =         {1936-7406 (print), 1936-7414 (electronic)},
  issn-l =       {1936-7406},
  coden =        {????},
  abstract =     {Field-Programmable Gate Arrays (FPGAs) are widely used
                 for custom hardware implementations, including in many
                 security-sensitive industries, such as defense,
                 communications, transportation, medical, and more.
                 Compiling source hardware descriptions to FPGA
                 \ldots{}},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-url =  {https://dl.acm.org/loi/trets},
  bibdate =      {Tue May 24 07:29:32 MDT 2022},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Faraji:2022:ACC,
  author =       {S. Rasoul Faraji and Pierre Abillama and Kia
                 Bazargan},
  title =        {Approximate Constant-Coefficient Multiplication Using
                 Hybrid Binary-Unary Computing for {FPGAs}},
  journal =      j-TRETS,
  volume =       {15},
  number =       {3},
  pages =        {29:1--29:25},
  articleno =    {29},
  month =        sep,
  year =         {2022},
  doi =          {https://doi.org/10.1145/3494570},
  url =          {https://dl.acm.org/doi/10.1145/3494570},
  issn =         {1936-7406 (print), 1936-7414 (electronic)},
  issn-l =       {1936-7406},
  coden =        {????},
  abstract =     {Multipliers are used in virtually all Digital Signal
                 Processing (DSP) applications such as image and video
                 processing. Multiplier efficiency has a direct impact
                 on the overall performance of such applications,
                 especially when real-time processing is \ldots{}},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-url =  {https://dl.acm.org/loi/trets},
  bibdate =      {Tue May 24 07:29:32 MDT 2022},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Du:2022:BAB,
  author =       {Gaoming Du and Bangyi Chen and Zhenmin Li and Zhenxing
                 Tu and Junjie Zhou and Shenya Wang and Qinghao Zhao and
                 Yongsheng Yin and Xiaolei Wang},
  title =        {A {BNN} Accelerator Based on Edge-skip-calculation
                 Strategy and Consolidation Compressed Tree},
  journal =      j-TRETS,
  volume =       {15},
  number =       {3},
  pages =        {30:1--30:20},
  articleno =    {30},
  month =        sep,
  year =         {2022},
  doi =          {https://doi.org/10.1145/3494569},
  url =          {https://dl.acm.org/doi/10.1145/3494569},
  issn =         {1936-7406 (print), 1936-7414 (electronic)},
  issn-l =       {1936-7406},
  coden =        {????},
  abstract =     {Binarized neural networks (BNNs) and batch
                 normalization (BN) have already become typical
                 techniques in artificial intelligence today.
                 Unfortunately, the massive accumulation and
                 multiplication in BNN models bring challenges to
                 field-programmable gate \ldots{}},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-url =  {https://dl.acm.org/loi/trets},
  bibdate =      {Tue May 24 07:29:32 MDT 2022},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Dewald:2022:ILP,
  author =       {Florian Dewald and Johanna Rohde and Christian
                 Hochberger and Heiko Mantel},
  title =        {Improving Loop Parallelization by a Combination of
                 Static and Dynamic Analyses in {HLS}},
  journal =      j-TRETS,
  volume =       {15},
  number =       {3},
  pages =        {31:1--31:31},
  articleno =    {31},
  month =        sep,
  year =         {2022},
  doi =          {https://doi.org/10.1145/3501801},
  url =          {https://dl.acm.org/doi/10.1145/3501801},
  issn =         {1936-7406 (print), 1936-7414 (electronic)},
  issn-l =       {1936-7406},
  coden =        {????},
  abstract =     {High-level synthesis (HLS) can be used to create
                 hardware accelerators for compute-intense software
                 parts such as loop structures. Usually, this process
                 requires significant amount of user interaction to
                 steer kernel selection and optimizations. This can
                 \ldots{}},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-url =  {https://dl.acm.org/loi/trets},
  bibdate =      {Tue May 24 07:29:32 MDT 2022},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Matthews:2022:QDR,
  author =       {Eric Matthews and Alec Lu and Zhenman Fang and Lesley
                 Shannon},
  title =        {{Quick-Div}: Rethinking Integer Divider Design for
                 {FPGA}-based Soft-processors},
  journal =      j-TRETS,
  volume =       {15},
  number =       {3},
  pages =        {32:1--32:27},
  articleno =    {32},
  month =        sep,
  year =         {2022},
  doi =          {https://doi.org/10.1145/3502492},
  url =          {https://dl.acm.org/doi/10.1145/3502492},
  issn =         {1936-7406 (print), 1936-7414 (electronic)},
  issn-l =       {1936-7406},
  coden =        {????},
  abstract =     {In today's FPGA-based soft-processors, one of the
                 slowest instructions is integer division. Compared to
                 the low single-digit latency of other arithmetic
                 operations, the fixed 32-cycle latency of radix-2
                 division is substantially longer. Given that today's
                 soft-processors typically only implement radix-2
                 division --- if they support hardware division at all
                 --- there is significant potential to improve the
                 performance of integer dividers.\par

                 In this work, we present a set of high-performance,
                 data-dependent, variable-latency integer dividers for
                 FPGA-based soft-processors that we call Quick-Div. We
                 compare them to various radix-N dividers and provide a
                 thorough analysis in terms of latency and resource
                 usage. In addition, we analyze the frequency scaling
                 for such divider designs when (1) treated as a
                 stand-alone unit and (2) integrated as part of a
                 high-performance soft-processor. Moreover, we provide
                 additional theoretical analysis of different dividers'
                 behaviour and develop a new better-performing Quick-Div
                 variant, called Quick-radix-4. Experimental results
                 show that our Quick-radix-4 design can achieve up to $
                 6.8 \times $ better performance and $ 6.1 \times $
                 better performance-per-LUT over the radix-2 divider for
                 applications such as random number generation. Even in
                 cases where division operations constitute as little as
                 1\% of all executed instructions, Quick-radix-4
                 provides a performance uplift of 16\% compared to the
                 radix-2 divider.},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-url =  {https://dl.acm.org/loi/trets},
  bibdate =      {Tue May 24 07:29:32 MDT 2022},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Roorda:2022:FAE,
  author =       {Esther Roorda and Seyedramin Rasoulinezhad and Philip
                 H. W. Leong and Steven J. E. Wilton},
  title =        {{FPGA} Architecture Exploration for {DNN}
                 Acceleration},
  journal =      j-TRETS,
  volume =       {15},
  number =       {3},
  pages =        {33:1--33:37},
  articleno =    {33},
  month =        sep,
  year =         {2022},
  doi =          {https://doi.org/10.1145/3503465},
  url =          {https://dl.acm.org/doi/10.1145/3503465},
  issn =         {1936-7406 (print), 1936-7414 (electronic)},
  issn-l =       {1936-7406},
  coden =        {????},
  abstract =     {Recent years have seen an explosion of machine
                 learning applications implemented on Field-Programmable
                 Gate Arrays (FPGAs). FPGA vendors and researchers have
                 responded by updating their fabrics to more efficiently
                 implement machine learning accelerators, \ldots{}},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-url =  {https://dl.acm.org/loi/trets},
  bibdate =      {Tue May 24 07:29:32 MDT 2022},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Bobda:2022:FFA,
  author =       {Christophe Bobda and Joel Mandebi Mbongue and Paul
                 Chow and Mohammad Ewais and Naif Tarafdar and Juan
                 Camilo Vega and Ken Eguro and Dirk Koch and Suranga
                 Handagala and Miriam Leeser and Martin Herbordt and
                 Hafsah Shahzad and Peter Hofste and Burkhard Ringlein
                 and Jakub Szefer and Ahmed Sanaullah and Russell
                 Tessier},
  title =        {The Future of {FPGA} Acceleration in Datacenters and
                 the Cloud},
  journal =      j-TRETS,
  volume =       {15},
  number =       {3},
  pages =        {34:1--34:42},
  articleno =    {34},
  month =        sep,
  year =         {2022},
  doi =          {https://doi.org/10.1145/3506713},
  url =          {https://dl.acm.org/doi/10.1145/3506713},
  issn =         {1936-7406 (print), 1936-7414 (electronic)},
  issn-l =       {1936-7406},
  coden =        {????},
  abstract =     {In this article, we survey existing academic and
                 commercial efforts to provide Field-Programmable Gate
                 Array (FPGA) acceleration in datacenters and the cloud.
                 The goal is a critical review of existing systems and a
                 discussion of their evolution from \ldots{}},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-url =  {https://dl.acm.org/loi/trets},
  bibdate =      {Tue May 24 07:29:32 MDT 2022},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Mentens:2022:ISS,
  author =       {Nele Mentens and Lionel Sousa and Pedro Trancoso},
  title =        {Introduction to the Special Section on {FPL 2020}},
  journal =      j-TRETS,
  volume =       {15},
  number =       {4},
  pages =        {35:1--35:??},
  articleno =    {35},
  month =        dec,
  year =         {2022},
  doi =          {https://doi.org/10.1145/3536336},
  url =          {https://dl.acm.org/doi/10.1145/3536336},
  issn =         {1936-7406 (print), 1936-7414 (electronic)},
  issn-l =       {1936-7406},
  coden =        {????},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-url =  {https://dl.acm.org/loi/trets},
  bibdate =      {Sat Mar 11 08:27:16 MST 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Shi:2022:EHF,
  author =       {Runbin Shi and Kaan Kara and Christoph Hagleitner and
                 Dionysios Diamantopoulos and Dimitris Syrivelis and
                 Gustavo Alonso},
  title =        {Exploiting {HBM} on {FPGAs} for Data Processing},
  journal =      j-TRETS,
  volume =       {15},
  number =       {4},
  pages =        {36:1--36:??},
  articleno =    {36},
  month =        dec,
  year =         {2022},
  doi =          {https://doi.org/10.1145/3491238},
  url =          {https://dl.acm.org/doi/10.1145/3491238},
  issn =         {1936-7406 (print), 1936-7414 (electronic)},
  issn-l =       {1936-7406},
  coden =        {????},
  abstract =     {Field Programmable Gate Arrays (FPGAs) are
                 increasingly being used in data centers and the cloud
                 due to their potential to accelerate certain workloads
                 as well as for their architectural flexibility, since
                 they can be used as accelerators, smart-NICs, or
                 \ldots{}},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-url =  {https://dl.acm.org/loi/trets},
  bibdate =      {Sat Mar 11 08:27:16 MST 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Nikolic:2022:DPD,
  author =       {Stefan Nikoli{\'c} and Grace Zgheib and Paolo Ienne},
  title =        {Detailed Placement for Dedicated {LUT}-Level {FPGA}
                 Interconnect},
  journal =      j-TRETS,
  volume =       {15},
  number =       {4},
  pages =        {37:1--37:??},
  articleno =    {37},
  month =        dec,
  year =         {2022},
  doi =          {https://doi.org/10.1145/3501802},
  url =          {https://dl.acm.org/doi/10.1145/3501802},
  issn =         {1936-7406 (print), 1936-7414 (electronic)},
  issn-l =       {1936-7406},
  coden =        {????},
  abstract =     {In this work, we develop timing-driven CAD support for
                 FPGA architectures with direct connections between
                 LUTs. We do so by proposing an efficient ILP-based
                 detailed placer, which moves a carefully selected
                 subset of LUTs from their original positions, so
                 \ldots{}},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-url =  {https://dl.acm.org/loi/trets},
  bibdate =      {Sat Mar 11 08:27:16 MST 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Zhang:2022:RFH,
  author =       {Niansong Zhang and Xiang Chen and Nachiket Kapre},
  title =        {{RapidLayout}: Fast Hard Block Placement of
                 {FPGA}-optimized Systolic Arrays Using Evolutionary
                 Algorithm},
  journal =      j-TRETS,
  volume =       {15},
  number =       {4},
  pages =        {38:1--38:??},
  articleno =    {38},
  month =        dec,
  year =         {2022},
  doi =          {https://doi.org/10.1145/3501803},
  url =          {https://dl.acm.org/doi/10.1145/3501803},
  issn =         {1936-7406 (print), 1936-7414 (electronic)},
  issn-l =       {1936-7406},
  coden =        {????},
  abstract =     {Evolutionary algorithms can outperform conventional
                 placement algorithms such as simulated annealing,
                 analytical placement, and manual placement on runtime,
                 wirelength, pipelining cost, and clock frequency when
                 mapping hard block intensive designs such as \ldots{}},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-url =  {https://dl.acm.org/loi/trets},
  bibdate =      {Sat Mar 11 08:27:16 MST 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Singh:2022:AWP,
  author =       {Gagandeep Singh and Dionysios Diamantopoulos and Juan
                 G{\'o}mez-Luna and Christoph Hagleitner and Sander
                 Stuijk and Henk Corporaal and Onur Mutlu},
  title =        {Accelerating Weather Prediction Using Near-Memory
                 Reconfigurable Fabric},
  journal =      j-TRETS,
  volume =       {15},
  number =       {4},
  pages =        {39:1--39:??},
  articleno =    {39},
  month =        dec,
  year =         {2022},
  doi =          {https://doi.org/10.1145/3501804},
  url =          {https://dl.acm.org/doi/10.1145/3501804},
  issn =         {1936-7406 (print), 1936-7414 (electronic)},
  issn-l =       {1936-7406},
  coden =        {????},
  abstract =     {Ongoing climate change calls for fast and accurate
                 weather and climate modeling. However, when solving
                 large-scale weather prediction simulations,
                 state-of-the-art CPU and GPU implementations suffer
                 from limited performance and high energy consumption.
                 \ldots{}},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-url =  {https://dl.acm.org/loi/trets},
  bibdate =      {Sat Mar 11 08:27:16 MST 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Malik:2022:HEA,
  author =       {Gurshaant Malik and Ian Elmore Lang and Rodolfo
                 Pellizzoni and Nachiket Kapre},
  title =        {{HopliteML}: Evolving Application Customized {FPGA
                 NoCs} with Adaptable Routers and Regulators},
  journal =      j-TRETS,
  volume =       {15},
  number =       {4},
  pages =        {40:1--40:??},
  articleno =    {40},
  month =        dec,
  year =         {2022},
  doi =          {https://doi.org/10.1145/3507699},
  url =          {https://dl.acm.org/doi/10.1145/3507699},
  issn =         {1936-7406 (print), 1936-7414 (electronic)},
  issn-l =       {1936-7406},
  coden =        {????},
  abstract =     {We can overcome the pessimism in worst-case routing
                 latency analysis of timing-predictable Network-on-Chip
                 (NoC) workloads by single-digit factors through the use
                 of a hybrid field-programmable gate array
                 (FPGA)-optimized NoC and workload-adapted \ldots{}},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-url =  {https://dl.acm.org/loi/trets},
  bibdate =      {Sat Mar 11 08:27:16 MST 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Cook:2022:INU,
  author =       {Hayden Cook and Jacob Arscott and Brent George and
                 Tanner Gaskin and Jeffrey Goeders and Brad Hutchings},
  title =        {Inducing Non-uniform {FPGA} Aging Using
                 Configuration-based Short Circuits},
  journal =      j-TRETS,
  volume =       {15},
  number =       {4},
  pages =        {41:1--41:??},
  articleno =    {41},
  month =        dec,
  year =         {2022},
  doi =          {https://doi.org/10.1145/3517042},
  url =          {https://dl.acm.org/doi/10.1145/3517042},
  issn =         {1936-7406 (print), 1936-7414 (electronic)},
  issn-l =       {1936-7406},
  coden =        {????},
  abstract =     {This work demonstrates a novel method of accelerating
                 FPGA aging by configuring FPGAs to implement thousands
                 of short circuits, resulting in high on-chip currents
                 and temperatures. Patterns of ring oscillators are
                 placed across the chip and are used to \ldots{}},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-url =  {https://dl.acm.org/loi/trets},
  bibdate =      {Sat Mar 11 08:27:16 MST 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Leong:2022:ISS,
  author =       {Philip H. W. Leong},
  title =        {Introduction to Special Section on {FPGA} 2021},
  journal =      j-TRETS,
  volume =       {15},
  number =       {4},
  pages =        {42:1--42:??},
  articleno =    {42},
  month =        dec,
  year =         {2022},
  doi =          {https://doi.org/10.1145/3536335},
  url =          {https://dl.acm.org/doi/10.1145/3536335},
  issn =         {1936-7406 (print), 1936-7414 (electronic)},
  issn-l =       {1936-7406},
  coden =        {????},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-url =  {https://dl.acm.org/loi/trets},
  bibdate =      {Sat Mar 11 08:27:16 MST 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Lu:2022:DSH,
  author =       {Alec Lu and Zhenman Fang and Lesley Shannon},
  title =        {Demystifying the Soft and Hardened Memory Systems of
                 Modern {FPGAs} for Software Programmers through
                 Microbenchmarking},
  journal =      j-TRETS,
  volume =       {15},
  number =       {4},
  pages =        {43:1--43:??},
  articleno =    {43},
  month =        dec,
  year =         {2022},
  doi =          {https://doi.org/10.1145/3517131},
  url =          {https://dl.acm.org/doi/10.1145/3517131},
  issn =         {1936-7406 (print), 1936-7414 (electronic)},
  issn-l =       {1936-7406},
  coden =        {????},
  abstract =     {Both modern datacenter and embedded Field Programmable
                 Gate Arrays (FPGAs) provide great opportunities for
                 high-performance and high-energy-efficiency computing.
                 With the growing public availability of FPGAs from
                 major cloud service providers such as AWS, \ldots{}},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-url =  {https://dl.acm.org/loi/trets},
  bibdate =      {Sat Mar 11 08:27:16 MST 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Chen:2022:TRE,
  author =       {Xinyu Chen and Feng Cheng and Hongshi Tan and Yao Chen
                 and Bingsheng He and Weng-Fai Wong and Deming Chen},
  title =        {{ThunderGP}: Resource-Efficient Graph Processing
                 Framework on {FPGAs} with {HLS}},
  journal =      j-TRETS,
  volume =       {15},
  number =       {4},
  pages =        {44:1--44:??},
  articleno =    {44},
  month =        dec,
  year =         {2022},
  doi =          {https://doi.org/10.1145/3517141},
  url =          {https://dl.acm.org/doi/10.1145/3517141},
  issn =         {1936-7406 (print), 1936-7414 (electronic)},
  issn-l =       {1936-7406},
  coden =        {????},
  abstract =     {FPGA has been an emerging computing infrastructure in
                 datacenters benefiting from fine-grained parallelism,
                 energy efficiency, and reconfigurability. Meanwhile,
                 graph processing has attracted tremendous interest in
                 data analytics, and its performance is \ldots{}},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-url =  {https://dl.acm.org/loi/trets},
  bibdate =      {Sat Mar 11 08:27:16 MST 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Langhammer:2022:SNA,
  author =       {Martin Langhammer and Eriko Nurvitadhi and Sergey
                 Gribok and Bogdan Pasca},
  title =        {{Stratix 10 NX} Architecture},
  journal =      j-TRETS,
  volume =       {15},
  number =       {4},
  pages =        {45:1--45:??},
  articleno =    {45},
  month =        dec,
  year =         {2022},
  doi =          {https://doi.org/10.1145/3520197},
  url =          {https://dl.acm.org/doi/10.1145/3520197},
  issn =         {1936-7406 (print), 1936-7414 (electronic)},
  issn-l =       {1936-7406},
  coden =        {????},
  abstract =     {The advent of AI has driven the exploration of
                 high-density low-precision arithmetic on FPGAs. This
                 has resulted in new methods in mapping both arithmetic
                 functions as well as dataflows onto the fabric, as well
                 as some changes to the embedded DSP Blocks. \ldots{}},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-url =  {https://dl.acm.org/loi/trets},
  bibdate =      {Sat Mar 11 08:27:16 MST 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Arora:2022:TSF,
  author =       "Aman Arora and Moinak Ghosh and Samidh Mehta and
                 Vaughn Betz and Lizy K. John",
  title =        "Tensor Slices: {FPGA} Building Blocks for the Deep
                 Learning Era",
  journal =      j-TRETS,
  volume =       "15",
  number =       "4",
  pages =        "46:1--46:??",
  month =        dec,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3529650",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 11 08:27:16 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3529650",
  abstract =     "FPGAs are well-suited for accelerating deep learning
                 (DL) applications owing to the rapidly changing
                 algorithms, network architectures and computation
                 requirements in this field. However, the generic
                 building blocks available on traditional FPGAs limit
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "46",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@article{Ebcioglu:2022:HPM,
  author =       {Kemal Ebcioglu and Ismail San},
  title =        {Highly Parallel Multi-{FPGA} System Compilation from
                 Sequential {C\slash C++} Code in the {AWS} Cloud},
  journal =      j-TRETS,
  volume =       {15},
  number =       {4},
  pages =        {47:1--47:??},
  articleno =    {47},
  month =        dec,
  year =         {2022},
  doi =          {https://doi.org/10.1145/3507698},
  url =          {https://dl.acm.org/doi/10.1145/3507698},
  issn =         {1936-7406 (print), 1936-7414 (electronic)},
  issn-l =       {1936-7406},
  coden =        {????},
  abstract =     {We present a High Level Synthesis compiler that
                 automatically obtains a multi-chip accelerator system
                 from a single-threaded sequential C/C++ application.
                 Invoking the multi-chip accelerator is functionally
                 identical to invoking the single-threaded \ldots{}},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-url =  {https://dl.acm.org/loi/trets},
  bibdate =      {Sat Mar 11 08:27:16 MST 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Panchapakesan:2022:SEA,
  author =       "Sathish Panchapakesan and Zhenman Fang and Jian Li",
  title =        "{SyncNN}: Evaluating and Accelerating Spiking Neural
                 Networks on {FPGAs}",
  journal =      j-TRETS,
  volume =       "15",
  number =       "4",
  pages =        "48:1--48:??",
  month =        dec,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3514253",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 11 08:27:16 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3514253",
  abstract =     "Compared to conventional artificial neural networks,
                 spiking neural networks (SNNs) are more biologically
                 plausible and require less computation due to their
                 event-driven nature of spiking neurons. However, the
                 default asynchronous execution of SNNs also \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "48",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Gibson:2022:ACM,
  author =       "Kahlan Gibson and Esther Roorda and Daniel Holanda
                 Noronha and Steven J. E. Wilton",
  title =        "Adaptive Clock Management of {HLS}-generated Circuits
                 on {FPGAs}",
  journal =      j-TRETS,
  volume =       "15",
  number =       "4",
  pages =        "49:1--49:??",
  month =        dec,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3520140",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 11 08:27:16 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3520140",
  abstract =     "In this article, we present Syncopation, a
                 performance-boosting fine-grained timing analysis and
                 adaptive clock management technique for High-Level
                 Synthesis-generated circuits implemented on
                 Field-Programmable Gate Arrays. The key idea is to use
                 the HLS \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "49",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Sherwin:2022:MFF,
  author =       "Krystine Dawn Sherwin and Kevin I-Kai Wang and Prabu
                 Thiagaraj and Ben Stappers and Oliver Sinnen",
  title =        "Median Filters on {FPGAs} for Infinite Data and Large,
                 Rectangular Windows",
  journal =      j-TRETS,
  volume =       "15",
  number =       "4",
  pages =        "50:1--50:??",
  month =        dec,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3530273",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 11 08:27:16 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3530273",
  abstract =     "Efficient architectures and implementations of median
                 filters have been well investigated in the past. In
                 this article, we focus on median filters for very big
                 scientific applications with very large windows and an
                 infinite stream of data, inspired by big \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "50",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Cong:2022:FHT,
  author =       "Jason Cong and Jason Lau and Gai Liu and Stephen
                 Neuendorffer and Peichen Pan and Kees Vissers and Zhiru
                 Zhang",
  title =        "{FPGA HLS} Today: Successes, Challenges, and
                 Opportunities",
  journal =      j-TRETS,
  volume =       "15",
  number =       "4",
  pages =        "51:1--51:??",
  month =        dec,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3530775",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 11 08:27:16 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3530775",
  abstract =     "The year 2011 marked an important transition for FPGA
                 high-level synthesis (HLS), as it went from prototyping
                 to deployment. A decade later, in this article, we
                 assess the progress of the deployment of HLS technology
                 and highlight the successes in several \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "51",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Sinnen:2023:ISS,
  author =       "Oliver Sinnen and Qiang Liu and Azadeh Davoodi",
  title =        "Introduction to Special Section on {FPT'20}",
  journal =      j-TRETS,
  volume =       "16",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3579850",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 11 08:27:18 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3579850",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Shi:2023:OGR,
  author =       "Kaichuang Shi and Xuegong Zhou and Hao Zhou and Lingli
                 Wang",
  title =        "An Optimized {GIB} Routing Architecture with Bent
                 Wires for {FPGA}",
  journal =      j-TRETS,
  volume =       "16",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3519599",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 11 08:27:18 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3519599",
  abstract =     "Field-programmable gate arrays (FPGAs) are widely used
                 because of the superiority in flexibility and lower
                 non-recurring engineering cost. How to optimize the
                 routing architecture is a key problem for FPGA
                 architects because it has a large impact on FPGA
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Li:2023:JBA,
  author =       "Xiang Li and Peter Stanwicks and George Provelengios
                 and Russell Tessier and Daniel Holcomb",
  title =        "Jitter-based Adaptive True Random Number Generation
                 Circuits for {FPGAs} in the Cloud",
  journal =      j-TRETS,
  volume =       "16",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3487554",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 11 08:27:18 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/prng.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3487554",
  abstract =     "In this article, we present and evaluate a true random
                 number generator (TRNG) design that is compatible with
                 the restrictions imposed by cloud-based Field
                 Programmable Gate Array (FPGA) providers such as Amazon
                 Web Services (AWS) EC2 F1. Because cloud \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Que:2023:RRM,
  author =       "Zhiqiang Que and Hiroki Nakahara and Hongxiang Fan and
                 He Li and Jiuxi Meng and Kuen Hung Tsoi and Xinyu Niu
                 and Eriko Nurvitadhi and Wayne Luk",
  title =        "{Remarn}: a Reconfigurable Multi-threaded Multi-core
                 Accelerator for Recurrent Neural Networks",
  journal =      j-TRETS,
  volume =       "16",
  number =       "1",
  pages =        "4:1--4:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3534969",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 11 08:27:18 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3534969",
  abstract =     "This work introduces Remarn, a reconfigurable
                 multi-threaded multi-core accelerator supporting both
                 spatial and temporal co-execution of Recurrent Neural
                 Network (RNN) inferences. It increases processing
                 capabilities and quality of service of cloud-based
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Schelten:2023:HTR,
  author =       "Niklas Schelten and Fritjof Steinert and Justin
                 Knapheide and Anton Schulte and Benno Stabernack",
  title =        "A High-Throughput, Resource-Efficient Implementation
                 of the {RoCEv2} Remote {DMA} Protocol and its
                 Application",
  journal =      j-TRETS,
  volume =       "16",
  number =       "1",
  pages =        "5:1--5:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3543176",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 11 08:27:18 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3543176",
  abstract =     "The use of application-specific accelerators in data
                 centers has been the state of the art for at least a
                 decade, starting with the availability of General
                 Purpose GPUs achieving higher performance either
                 overall or per watt. In most cases, these \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Giechaskiel:2023:CVC,
  author =       "Ilias Giechaskiel and Shanquan Tian and Jakub Szefer",
  title =        "{Cross-VM} Covert- and Side-Channel Attacks in Cloud
                 {FPGAs}",
  journal =      j-TRETS,
  volume =       "16",
  number =       "1",
  pages =        "6:1--6:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3534972",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 11 08:27:18 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3534972",
  abstract =     "The availability of FPGAs in cloud data centers offers
                 rapid, on-demand access to reconfigurable hardware
                 compute resources that users can adapt to their own
                 needs. However, the low-level access to the FPGA
                 hardware and associated resources such as the
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Wolf:2023:ASE,
  author =       "Dennis Leander Wolf and Christoph Spang and Daniel
                 Diener and Christian Hochberger",
  title =        "Advantages of a Statistical Estimation Approach for
                 Clock Frequency Estimation of Heterogeneous and
                 Irregular {CGRAs}",
  journal =      j-TRETS,
  volume =       "16",
  number =       "1",
  pages =        "7:1--7:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3531062",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 11 08:27:18 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3531062",
  abstract =     "Estimating the maximum clock frequency of homogeneous
                 Coarse Grained Reconfigurable Arrays/Architectures
                 (CGRAs) with an arbitrary number of Processing Elements
                 (PE) is difficult. Clock frequency estimation of highly
                 heterogeneous CGRAs takes additional \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Ioannou:2023:SOA,
  author =       "Lenos Ioannou and Suhaib A. Fahmy",
  title =        "Streaming Overlay Architecture for Lightweight {LSTM}
                 Computation on {FPGA SoCs}",
  journal =      j-TRETS,
  volume =       "16",
  number =       "1",
  pages =        "8:1--8:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3543069",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 11 08:27:18 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3543069",
  abstract =     "Long-Short Term Memory (LSTM) networks, and Recurrent
                 Neural Networks (RNNs) in general, have demonstrated
                 their suitability in many time series data
                 applications, especially in Natural Language Processing
                 (NLP). Computationally, LSTMs introduce \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Li:2023:SSA,
  author =       "Xiangwei Li and Douglas L. Maskell and Carol Jingyi Li
                 and Philip H. W. Leong and David Boland",
  title =        "A Scalable Systolic Accelerator for Estimation of the
                 Spectral Correlation Density Function and Its {FPGA}
                 Implementation",
  journal =      j-TRETS,
  volume =       "16",
  number =       "1",
  pages =        "9:1--9:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3546181",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 11 08:27:18 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3546181",
  abstract =     "The spectral correlation density (SCD) function is the
                 time-averaged correlation of two spectral components
                 used for analyzing periodic signals with time-varying
                 spectral content. Although the analysis is extremely
                 powerful, it has not been widely adopted \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Tao:2023:LGL,
  author =       "Zhuofu Tao and Chen Wu and Yuan Liang and Kun Wang and
                 Lei He",
  title =        "{LW-GCN}: a Lightweight {FPGA}-based Graph
                 Convolutional Network Accelerator",
  journal =      j-TRETS,
  volume =       "16",
  number =       "1",
  pages =        "10:1--10:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3550075",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 11 08:27:18 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3550075",
  abstract =     "Graph convolutional networks (GCNs) have been
                 introduced to effectively process non-Euclidean graph
                 data. However, GCNs incur large amounts of irregularity
                 in computation and memory access, which prevents
                 efficient use of traditional neural network \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Moini:2023:VSI,
  author =       "Shayan Moini and Aleksa Deric and Xiang Li and George
                 Provelengios and Wayne Burleson and Russell Tessier and
                 Daniel Holcomb",
  title =        "Voltage Sensor Implementations for Remote Power
                 Attacks on {FPGAs}",
  journal =      j-TRETS,
  volume =       "16",
  number =       "1",
  pages =        "11:1--11:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3555048",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 11 08:27:18 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3555048",
  abstract =     "This article presents a study of two types of on-chip
                 FPGA voltage sensors based on ring oscillators (ROs)
                 and time-to-digital converter (TDCs), respectively. It
                 has previously been shown that these sensors are often
                 used to extract side-channel \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Kalantar:2023:FBA,
  author =       "Amin Kalantar and Zachary Zimmerman and Philip Brisk",
  title =        "{FPGA}-based Acceleration of Time Series Similarity
                 Prediction: From Cloud to Edge",
  journal =      j-TRETS,
  volume =       "16",
  number =       "1",
  pages =        "12:1--12:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3555810",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 11 08:27:18 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3555810",
  abstract =     "With the proliferation of low-cost sensors and the
                 Internet of Things, the rate of producing data far
                 exceeds the compute and storage capabilities of today's
                 infrastructure. Much of this data takes the form of
                 time series, and in response, there has been \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Vestias:2023:EDL,
  author =       "M{\'a}rio V{\'e}stias and Rui P. Duarte and Jos{\'e}
                 T. de Sousa and Hor{\'a}cio Neto",
  title =        "Efficient Design of Low Bitwidth Convolutional Neural
                 Networks on {FPGA} with Optimized Dot Product Units",
  journal =      j-TRETS,
  volume =       "16",
  number =       "1",
  pages =        "13:1--13:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3546182",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 11 08:27:18 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3546182",
  abstract =     "Designing hardware accelerators to run the inference
                 of convolutional neural networks (CNN) is under
                 intensive research. Several different architectures
                 have been proposed along with hardware-oriented
                 optimizations of the neural network models. One of the
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{deMoura:2023:DCR,
  author =       "Rafael F{\~a}o de Moura and Joao Paulo Cardoso de Lima
                 and Luigi Carro",
  title =        "Data and Computation Reuse in {CNNs} Using Memristor
                 {TCAMs}",
  journal =      j-TRETS,
  volume =       "16",
  number =       "1",
  pages =        "14:1--14:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3549536",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 11 08:27:18 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3549536",
  abstract =     "Exploiting computational and data reuse in CNNs is
                 crucial for the successful design of
                 resource-constrained platforms. In image recognition
                 applications, high levels of input locality and
                 redundancy present in CNNs have become the golden goose
                 for \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Abdelhamid:2023:SMC,
  author =       "Riadh {Ben Abdelhamid} and Yoshiki Yamaguchi and
                 Taisuke Boku",
  title =        "A Scalable Many-core Overlay Architecture on an
                 {HBM2}-enabled Multi-Die {FPGA}",
  journal =      j-TRETS,
  volume =       "16",
  number =       "1",
  pages =        "15:1--15:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3547657",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 11 08:27:18 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3547657",
  abstract =     "The overlay architecture enables to raise the
                 abstraction level of hardware design and enhances
                 hardware-accelerated applications' portability. In
                 FPGAs, there is a growing awareness of the overlay
                 structure as typified by many-core architecture. It
                 works \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Iskandar:2023:NMC,
  author =       "Veronia Iskandar and Mohamed A. {Abd El Ghany} and
                 Diana G{\"o}hringer",
  title =        "Near-memory Computing on {FPGAs} with {$3$D}-stacked
                 Memories: Applications, Architectures, and
                 Optimizations",
  journal =      j-TRETS,
  volume =       "16",
  number =       "1",
  pages =        "16:1--16:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3547658",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Mar 11 08:27:18 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3547658",
  abstract =     "The near-memory computing (NMC) paradigm has
                 transpired as a promising method for overcoming the
                 memory wall challenges of future computing
                 architectures. Modern systems integrating 3D-stacked
                 DRAM memory can be leveraged to prevent unnecessary
                 data \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Shahsavani:2023:ECM,
  author =       "Soheil Nazar Shahsavani and Arash Fayyazi and Mahdi
                 Nazemi and Massoud Pedram",
  title =        "Efficient Compilation and Mapping of Fixed Function
                 Combinational Logic onto Digital Signal Processors
                 Targeting Neural Network Inference and Utilizing
                 High-level Synthesis",
  journal =      j-TRETS,
  volume =       "16",
  number =       "2",
  pages =        "17:1--17:??",
  month =        jun,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3559543",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jul 3 07:48:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3559543",
  abstract =     "Recent efforts for improving the performance of neural
                 network (NN) accelerators that meet today's application
                 requirements have given rise to a new trend of
                 logic-based NN inference relying on fixed function
                 combinational logic. Mapping such large \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "17",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Choi:2023:FAP,
  author =       "Young-Kyu Choi and Carlos Santillana and Yujia Shen
                 and Adnan Darwiche and Jason Cong",
  title =        "{FPGA} Acceleration of Probabilistic Sentential
                 Decision Diagrams with High-level Synthesis",
  journal =      j-TRETS,
  volume =       "16",
  number =       "2",
  pages =        "18:1--18:??",
  month =        jun,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3561514",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jul 3 07:48:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3561514",
  abstract =     "Probabilistic Sentential Decision Diagrams (PSDDs)
                 provide efficient methods for modeling and reasoning
                 with probability distributions in the presence of
                 massive logical constraints. PSDDs can also be
                 synthesized from graphical models such as Bayesian
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "18",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Ganewattha:2023:HAR,
  author =       "Chanaka Ganewattha and Zaheer Khan and Janne
                 Lehtom{\"a}ki and Matti Latva-Aho",
  title =        "Hardware-accelerated Real-time Drift-awareness for
                 Robust Deep Learning on Wireless {RF} Data",
  journal =      j-TRETS,
  volume =       "16",
  number =       "2",
  pages =        "19:1--19:??",
  month =        jun,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3563394",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jul 3 07:48:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3563394",
  abstract =     "Proactive and intelligent management of network
                 resource utilization (RU) using deep learning (DL) can
                 significantly improve the efficiency and performance of
                 the next generation of wireless networks. However,
                 variations in wireless RU are often affected \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Proulx:2023:SFC,
  author =       {Alexandre Proulx and Jean-Yves Chouinard and Paul
                 Fortier and Amine Miled},
  title =        {A Survey on {FPGA} Cybersecurity Design Strategies},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {16},
  number =       {2},
  articleno =    {20},
  pages =        {20:1--20:??},
  month =        jun,
  year =         {2023},
  DOI =          {https://doi.org/10.1145/3561515},
  URL =          {https://dl.acm.org/doi/10.1145/3561515},
  journal-URL =  {https://dl.acm.org/loi/trets},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  abstract =     {This article presents a critical literature review on
                 the security aspects of field-programmable gate array
                 (FPGA) devices. FPGA devices present unique challenges
                 to cybersecurity through their reconfigurable nature.
                 The article also pays special \ldots{}},
  bibdate =      {Mon Jul 3 07:48:36 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Soldavini:2023:ACH,
  author =       {Stephanie Soldavini and Karl Friebel and Mattia
                 Tibaldi and Gerald Hempel and Jeronimo Castrillon and
                 Christian Pilato},
  title =        {Automatic Creation of High-bandwidth Memory
                 Architectures from Domain-specific Languages: The Case
                 of Computational Fluid Dynamics},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {16},
  number =       {2},
  articleno =    {21},
  pages =        {21:1--21:??},
  month =        jun,
  year =         {2023},
  DOI =          {https://doi.org/10.1145/3563553},
  URL =          {https://dl.acm.org/doi/10.1145/3563553},
  journal-URL =  {https://dl.acm.org/loi/trets},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  abstract =     {Numerical simulations can help solve complex problems.
                 Most of these algorithms are massively parallel and
                 thus good candidates for FPGA acceleration thanks to
                 spatial parallelism. Modern FPGA devices can leverage
                 high-bandwidth memory technologies, but \ldots{}},
  bibdate =      {Mon Jul 3 07:48:36 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Yang:2023:HOF,
  author =       "Gangqiang Yang and Zhengyuan Shi and Cheng Chen and
                 Hailiang Xiong and Fudong Li and Honggang Hu and Zhiguo
                 Wan",
  title =        "Hardware Optimizations of {Fruit-80} Stream Cipher:
                 Smaller than {Grain}",
  journal =      j-TRETS,
  volume =       "16",
  number =       "2",
  pages =        "22:1--22:??",
  month =        jun,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3569455",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jul 3 07:48:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3569455",
  abstract =     "Fruit-80, which emerged as an ultra-lightweight stream
                 cipher with 80-bit secret key, is oriented toward
                 resource-constrained devices in the Internet of Things.
                 In this article, we propose area and speed optimization
                 architectures of Fruit-80 on FPGAs. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "22",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Basalama:2023:FEE,
  author =       {Suhail Basalama and Atefeh Sohrabizadeh and Jie Wang
                 and Licheng Guo and Jason Cong},
  title =        {{FlexCNN}: an End-to-end Framework for Composing {CNN}
                 Accelerators on {FPGA}},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {16},
  number =       {2},
  articleno =    {23},
  pages =        {23:1--23:??},
  month =        jun,
  year =         {2023},
  DOI =          {https://doi.org/10.1145/3570928},
  URL =          {https://dl.acm.org/doi/10.1145/3570928},
  journal-URL =  {https://dl.acm.org/loi/trets},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  abstract =     {With reduced data reuse and parallelism, recent
                 convolutional neural networks (CNNs) create new
                 challenges for FPGA acceleration. Systolic arrays (SAs)
                 are efficient, scalable architectures for convolutional
                 layers, but without proper optimizations, their
                 \ldots{}},
  bibdate =      {Mon Jul 3 07:48:36 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Meyer:2023:MFD,
  author =       {Marius Meyer and Tobias Kenter and Christian Plessl},
  title =        {{Multi-FPGA} Designs and Scaling of {HPC} Challenge
                 Benchmarks via {MPI} and Circuit-switched Inter-{FPGA}
                 Networks},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {16},
  number =       {2},
  articleno =    {24},
  pages =        {24:1--24:??},
  month =        jun,
  year =         {2023},
  DOI =          {https://doi.org/10.1145/3576200},
  URL =          {https://dl.acm.org/doi/10.1145/3576200},
  journal-URL =  {https://dl.acm.org/loi/trets},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  abstract =     {While FPGA accelerator boards and their respective
                 high-level design tools are maturing, there is still a
                 lack of multi-FPGA applications, libraries, and not
                 least, benchmarks and reference implementations towards
                 sustained HPC usage of these devices. As \ldots{}},
  bibdate =      {Mon Jul 3 07:48:36 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Ueno:2023:VVC,
  author =       {Tomohiro Ueno and Kentaro Sano},
  title =        {{VCSN}: Virtual Circuit-Switching Network for Flexible
                 and Simple-to-Operate Communication in {HPC FPGA}
                 Cluster},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {16},
  number =       {2},
  articleno =    {25},
  pages =        {25:1--25:??},
  month =        jun,
  year =         {2023},
  DOI =          {https://doi.org/10.1145/3579848},
  URL =          {https://dl.acm.org/doi/10.1145/3579848},
  journal-URL =  {https://dl.acm.org/loi/trets},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  abstract =     {FPGA clusters promise to play a critical role in
                 high-performance computing (HPC) systems in the near
                 future due to their flexibility and high power
                 efficiency. The operation of large-scale
                 general-purpose FPGA clusters on which multiple users
                 run diverse \ldots{}},
  bibdate =      {Mon Jul 3 07:48:36 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Nayak:2023:IEE,
  author =       {Ankita Nayak and Keyi Zhang and Rajsekhar Setaluri and
                 Alex Carsello and Makai Mann and Christopher Torng and
                 Stephen Richardson and Rick Bahr and Pat Hanrahan and
                 Mark Horowitz and Priyanka Raina},
  title =        {Improving Energy Efficiency of {CGRAs} with
                 Low-Overhead Fine-Grained Power Domains},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {16},
  number =       {2},
  articleno =    {26},
  pages =        {26:1--26:??},
  month =        jun,
  year =         {2023},
  DOI =          {https://doi.org/10.1145/3558394},
  URL =          {https://dl.acm.org/doi/10.1145/3558394},
  journal-URL =  {https://dl.acm.org/loi/trets},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  abstract =     {To effectively minimize static power for a wide range
                 of applications, power domains for coarse-grained
                 reconfigurable array (CGRA) architectures need to be
                 more fine-grained than those found in a typical
                 application-specific integrated circuit. However,
                 \ldots{}},
  bibdate =      {Mon Jul 3 07:48:36 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Zhao:2023:ASC,
  author =       {Kang Zhao and Yuchun Ma and Ruining He and Jixing
                 Zhang and Ning Xu and Jinian Bian},
  title =        {Adaptive Selection and Clustering of Partial
                 Reconfiguration Modules for Modern {FPGA} Design Flow},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {16},
  number =       {2},
  articleno =    {27},
  pages =        {27:1--27:??},
  month =        jun,
  year =         {2023},
  DOI =          {https://doi.org/10.1145/3567427},
  URL =          {https://dl.acm.org/doi/10.1145/3567427},
  journal-URL =  {https://dl.acm.org/loi/trets},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  abstract =     {Dynamic Partially Reconfiguration (DPR) on FPGA has
                 attracted significant research interest in recent years
                 since it provides benefits such as reduced area and
                 flexible functionality. However, due to the lack of
                 supporting synthesis tools in the current \ldots{}},
  bibdate =      {Mon Jul 3 07:48:36 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Tian:2023:SSA,
  author =       {Xingyu Tian and Zhifan Ye and Alec Lu and Licheng Guo
                 and Yuze Chi and Zhenman Fang},
  title =        {{SASA}: a Scalable and Automatic Stencil Acceleration
                 Framework for Optimized Hybrid Spatial and Temporal
                 Parallelism on {HBM}-based {FPGAs}},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {16},
  number =       {2},
  articleno =    {28},
  pages =        {28:1--28:??},
  month =        jun,
  year =         {2023},
  DOI =          {https://doi.org/10.1145/3572547},
  URL =          {https://dl.acm.org/doi/10.1145/3572547},
  journal-URL =  {https://dl.acm.org/loi/trets},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  abstract =     {Stencil computation is one of the fundamental
                 computing patterns in many application domains such as
                 scientific computing and image processing. While there
                 are promising studies that accelerate stencils on
                 FPGAs, there lacks an automated acceleration \ldots{}},
  bibdate =      {Mon Jul 3 07:48:36 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

%%% NOTE(review): BibTeX reads ``Dhayalakumar M.'' as given name
%%% Dhayalakumar with surname M. (hence the key M:2023:DAR); confirm
%%% against the publisher's byline before changing the name or the key.
@Article{M:2023:DAR,
  author =       {Dhayalakumar M. and Noor Mahammad Sk},
  title =        {Deterministic Approach for Range-enhanced
                 Reconfigurable Packet Classification Engine},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {16},
  number =       {2},
  articleno =    {29},
  pages =        {29:1--29:??},
  month =        jun,
  year =         {2023},
  DOI =          {https://doi.org/10.1145/3586577},
  URL =          {https://dl.acm.org/doi/10.1145/3586577},
  journal-URL =  {https://dl.acm.org/loi/trets},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  abstract =     {Reconfigurable hardware is a promising technology for
                 implementing firewalls, routing mechanisms, and new
                 protocols for evolving high-performance network
                 systems. This work presents a novel deterministic
                 approach for a Range-enhanced Reconfigurable Packet
                 \ldots{}},
  bibdate =      {Mon Jul 3 07:48:36 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Koch:2023:ISI,
  author =       {Andreas Koch and Wei Zhang},
  title =        {Introduction to the Special Issue on {FPT 2021}},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {16},
  number =       {2},
  articleno =    {30},
  pages =        {30:1--30:??},
  month =        jun,
  year =         {2023},
  DOI =          {https://doi.org/10.1145/3603701},
  URL =          {https://dl.acm.org/doi/10.1145/3603701},
  journal-URL =  {https://dl.acm.org/loi/trets},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Mon Jul 3 07:48:36 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Attia:2023:TSL,
  author =       "Sameh Attia and Vaughn Betz",
  title =        "Toward Software-like Debugging for {FPGAs} via
                 Checkpointing and Transaction-based Co-Simulation",
  journal =      j-TRETS,
  volume =       "16",
  number =       "2",
  pages =        "31:1--31:??",
  month =        jun,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3552521",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jul 3 07:48:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3552521",
  abstract =     "Checkpoint-based debugging flows have recently been
                 developed that allow the user to move the design state
                 back and forth between an FPGA and a simulator. They
                 provide a software-like debugging experience by
                 combining the speed of hardware execution and
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "31",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Gebauer:2023:QMR,
  author =       {Richard Gebauer and Nick Karcher and Mehmed G{\"u}ler
                 and Oliver Sander},
  title =        {{QiCells}: a Modular {RFSoC}-based Approach to
                 Interface Superconducting Quantum Bits},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {16},
  number =       {2},
  articleno =    {32},
  pages =        {32:1--32:??},
  month =        jun,
  year =         {2023},
  DOI =          {https://doi.org/10.1145/3571820},
  URL =          {https://dl.acm.org/doi/10.1145/3571820},
  journal-URL =  {https://dl.acm.org/loi/trets},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  abstract =     {Quantum computers will be a revolutionary extension of
                 the heterogeneous computing world. They consist of many
                 quantum bits (qubits) and require a careful design of
                 the interface between the classical computer
                 architecture and the quantum processor. For \ldots{}},
  bibdate =      {Mon Jul 3 07:48:36 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Suh:2023:AHC,
  author =       {Han-Sok Suh and Jian Meng and Ty Nguyen and Vijay
                 Kumar and Yu Cao and Jae-Sun Seo},
  title =        {Algorithm--hardware Co-optimization for
                 Energy-efficient Drone Detection on
                 Resource-constrained {FPGA}},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {16},
  number =       {2},
  articleno =    {33},
  pages =        {33:1--33:??},
  month =        jun,
  year =         {2023},
  DOI =          {https://doi.org/10.1145/3583074},
  URL =          {https://dl.acm.org/doi/10.1145/3583074},
  journal-URL =  {https://dl.acm.org/loi/trets},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  abstract =     {Convolutional neural network (CNN)-based object
                 detection has achieved very high accuracy; e.g.,
                 single-shot multi-box detectors (SSDs) can efficiently
                 detect and localize various objects in an input image.
                 However, they require a high amount of \ldots{}},
  bibdate =      {Mon Jul 3 07:48:36 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Bucknall:2023:ZEE,
  author =       {Alex R. Bucknall and Suhaib A. Fahmy},
  title =        {{ZyPR}: End-to-end Build Tool and Runtime Manager for
                 Partial Reconfiguration of {FPGA SoCs} at the Edge},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {16},
  number =       {3},
  articleno =    {34},
  pages =        {34:1--34:??},
  month =        sep,
  year =         {2023},
  DOI =          {https://doi.org/10.1145/3585521},
  URL =          {https://dl.acm.org/doi/10.1145/3585521},
  journal-URL =  {https://dl.acm.org/loi/trets},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  abstract =     {Partial reconfiguration (PR) is a key enabler to the
                 design and development of adaptive systems on modern
                 Field Programmable Gate Array (FPGA) Systems-on-Chip
                 (SoCs), allowing hardware to be adapted dynamically at
                 runtime. Vendor-supported PR \ldots{}},
  bibdate =      {Sat Aug 19 07:37:30 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Corts:2023:SPS,
  author =       {Reinout Corts and Nikolaos Alachiotis},
  title =        {A Survey of Processing Systems for Phylogenetics and
                 Population Genetics},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {16},
  number =       {3},
  articleno =    {35},
  pages =        {35:1--35:??},
  month =        sep,
  year =         {2023},
  DOI =          {https://doi.org/10.1145/3588033},
  URL =          {https://dl.acm.org/doi/10.1145/3588033},
  journal-URL =  {https://dl.acm.org/loi/trets},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  abstract =     {The COVID-19 pandemic brought Bioinformatics into the
                 spotlight, revealing that several existing methods,
                 algorithms, and tools were not well prepared to handle
                 large amounts of genomic data efficiently. This led to
                 prohibitively long execution times and \ldots{}},
  bibdate =      {Sat Aug 19 07:37:30 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Machado:2023:NNH,
  author =       {Pedro Machado and Jo{\~a}o Filipe Ferreira and Andreas
                 Oikonomou and T. M. McGinnity},
  title =        {{NeuroHSMD}: Neuromorphic Hybrid Spiking Motion
                 Detector},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {16},
  number =       {3},
  articleno =    {36},
  pages =        {36:1--36:??},
  month =        sep,
  year =         {2023},
  DOI =          {https://doi.org/10.1145/3588318},
  URL =          {https://dl.acm.org/doi/10.1145/3588318},
  journal-URL =  {https://dl.acm.org/loi/trets},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  abstract =     {Vertebrate retinas are highly-efficient in processing
                 trivial visual tasks such as detecting moving objects,
                 which still represent complex challenges for modern
                 computers. In vertebrates, the detection of object
                 motion is performed by specialised retinal \ldots{}},
  bibdate =      {Sat Aug 19 07:37:30 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Geethakumari:2023:SAC,
  author =       "Prajith Ramakrishnan Geethakumari and Ioannis
                 Sourdis",
  title =        "Stream Aggregation with Compressed Sliding Windows",
  journal =      j-TRETS,
  volume =       "16",
  number =       "3",
  pages =        "37:1--37:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3590774",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sat Aug 19 07:37:30 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3590774",
  abstract =     "High performance stream aggregation is critical for
                 many emerging applications that analyze massive volumes
                 of data. Incoming data needs to be stored in a sliding
                 window during processing, in case the aggregation
                 functions cannot be computed \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "37",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Karakchi:2023:NND,
  author =       {Rasha Karakchi and Jason D. Bakos},
  title =        {{NAPOLY}: a Non-deterministic Automata Processor
                 {OverLaY}},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {16},
  number =       {3},
  articleno =    {38},
  pages =        {38:1--38:??},
  month =        sep,
  year =         {2023},
  DOI =          {https://doi.org/10.1145/3593586},
  URL =          {https://dl.acm.org/doi/10.1145/3593586},
  journal-URL =  {https://dl.acm.org/loi/trets},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  abstract =     {Deterministic and Non-deterministic Finite Automata
                 (DFA and NFA) comprise the core of many big data
                 applications. Recent efforts to develop Domain-Specific
                 Architectures (DSAs) for DFA/NFA have taken divergent
                 approaches, but achieving consistent \ldots{}},
  bibdate =      {Sat Aug 19 07:37:30 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Raut:2023:EAE,
  author =       {Gopal Raut and Saurabh Karkun and Santosh Kumar
                 Vishvakarma},
  title =        {An Empirical Approach to Enhance Performance for
                 Scalable {CORDIC}-Based Deep Neural Networks},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {16},
  number =       {3},
  articleno =    {39},
  pages =        {39:1--39:??},
  month =        sep,
  year =         {2023},
  DOI =          {https://doi.org/10.1145/3596220},
  URL =          {https://dl.acm.org/doi/10.1145/3596220},
  journal-URL =  {https://dl.acm.org/loi/trets},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  abstract =     {Practical implementation of deep neural networks
                 (DNNs) demands significant hardware resources,
                 necessitating high computational power and memory
                 bandwidth. While existing field-programmable gate array
                 (FPGA)-based DNN accelerators are primarily optimized
                 \ldots{}},
  bibdate =      {Sat Aug 19 07:37:30 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Leeser:2023:AEA,
  author =       {Miriam Leeser},
  title =        {Artifact Evaluation for {ACM TRETS} Papers Submitted
                 from the {FPT} Journal Track},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {16},
  number =       {3},
  articleno =    {40},
  pages =        {40:1--40:??},
  month =        sep,
  year =         {2023},
  DOI =          {https://doi.org/10.1145/3596513},
  URL =          {https://dl.acm.org/doi/10.1145/3596513},
  journal-URL =  {https://dl.acm.org/loi/trets},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  abstract =     {Authors of papers that were accepted to ACM TRETS via
                 the FPT 2022 journal track had the option of
                 participating in Artifact Evaluation (AE). Four papers
                 from this track volunteered to participate in the AE
                 process. All of these papers have been awarded
                 \ldots{}},
  bibdate =      {Sat Aug 19 07:37:30 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Li:2023:FPF,
  author =       {Carol Jingyi Li and Xiangwei Li and Binglei Lou and
                 Craig T. Jin and David Boland and Philip H. W. Leong},
  title =        {Fixed-point {FPGA} Implementation of the {FFT}
                 Accumulation Method for Real-time Cyclostationary
                 Analysis},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {16},
  number =       {3},
  articleno =    {41},
  pages =        {41:1--41:??},
  month =        sep,
  year =         {2023},
  DOI =          {https://doi.org/10.1145/3567429},
  URL =          {https://dl.acm.org/doi/10.1145/3567429},
  journal-URL =  {https://dl.acm.org/loi/trets},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  abstract =     {The spectral correlation density (SCD) is an important
                 tool in cyclostationary signal detection and
                 classification. Even using efficient techniques based
                 on the fast Fourier transform (FFT), real-time
                 implementations are challenging because of the high
                 \ldots{}},
  bibdate =      {Sat Aug 19 07:37:30 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Lou:2023:FCF,
  author =       {Binglei Lou and David Boland and Philip Leong},
  title =        {{fSEAD}: a Composable {FPGA}-based Streaming Ensemble
                 Anomaly Detection Library},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {16},
  number =       {3},
  articleno =    {42},
  pages =        {42:1--42:??},
  month =        sep,
  year =         {2023},
  DOI =          {https://doi.org/10.1145/3568992},
  URL =          {https://dl.acm.org/doi/10.1145/3568992},
  journal-URL =  {https://dl.acm.org/loi/trets},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  abstract =     {Machine learning ensembles combine multiple base
                 models to produce a more accurate output. They can be
                 applied to a range of machine learning problems,
                 including anomaly detection. In this article, we
                 investigate how to maximize the composability and
                 \ldots{}},
  bibdate =      {Sat Aug 19 07:37:30 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Shi:2023:DSE,
  author =       {Zhengyuan Shi and Cheng Chen and Gangqiang Yang and
                 Hailiang Xiong and Fudong Li and Honggang Hu and Zhiguo
                 Wan},
  title =        {Design Space Exploration of {Galois} and {Fibonacci}
                 Configuration Based on {Espresso} Stream Cipher},
  journal =      j-TRETS,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  ajournal =     {ACM Trans. Reconfigurable Technol. Syst.},
  volume =       {16},
  number =       {3},
  articleno =    {43},
  pages =        {43:1--43:??},
  month =        sep,
  year =         {2023},
  DOI =          {https://doi.org/10.1145/3567428},
  URL =          {https://dl.acm.org/doi/10.1145/3567428},
  journal-URL =  {https://dl.acm.org/loi/trets},
  CODEN =        {????},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  abstract =     {Fibonacci and Galois are two different kinds of
                 configurations in stream ciphers. Although many
                 transformations between two configurations have been
                 proposed, there is no sufficient analysis of their FPGA
                 performance. Espresso stream cipher provides an
                 \ldots{}},
  bibdate =      {Sat Aug 19 07:37:30 MDT 2023},
  bibsource =    {https://www.math.utah.edu/pub/tex/bib/fibquart.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Mao:2023:HPC,
  author          = {Gaoyu Mao and Donglong Chen and Guangyan Li and
                     Wangchen Dai and Abdurrashid Ibrahim Sanka and
                     {\c{C}}etin Kaya Ko{\c{c}} and Ray C. C. Cheung},
  title           = {High-performance and Configurable {SW\slash HW}
                     Co-design of Post-quantum Signature
                     {CRYSTALS-Dilithium}},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  volume          = {16},
  number          = {3},
  articleno       = {44},
  pages           = {44:1--44:??},
  month           = sep,
  year            = {2023},
  DOI             = {https://doi.org/10.1145/3569456},
  URL             = {https://dl.acm.org/doi/10.1145/3569456},
  journal-URL     = {https://dl.acm.org/loi/trets},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  CODEN           = {????},
  abstract        = {CRYSTALS-Dilithium is a lattice-based post-quantum
                     digital signature scheme that is resistant to attacks
                     by quantum computers and has been selected to be
                     standardized in the NIST post-quantum cryptography
                     (PQC) standardization process. However, the speed
                     performance and design flexibility of the Dilithium
                     still need to be evaluated. This article presents a
                     high-performance software\slash hardware co-design of
                     CRYSTALS-Dilithium based on the NIST PQC round-3
                     parameters. High-speed pipelined hardware modules for
                     NTT\slash INTT, point-wise multiplication\slash
                     addition, and for SHAKE are included in the design to
                     accelerate the time-consuming operations in Dilithium.
                     All hardware modules are parameterized, thus allowing
                     full support of runtime configuration to increase
                     versatility. Moreover, the proposed software\slash
                     hardware architecture and tight operating workflows
                     reduce the data transmission overhead between the
                     processor and other hardware modules. The hardware
                     accelerator is implemented with a reconfigurable logic
                     on FPGA and is integrated with the high-performance ARM
                     Cortex-A9 processor in the Xilinx Zynq Architecture. We
                     measure the performance of the software\slash hardware
                     system for Dilithium in NIST security levels 2, 3, and
                     5. Compared to pure software implementations, we
                     achieve 8.7--12.5 times speedup in Key generation,
                     6.3--7.3 times speedup in Sign, and 9.1--12.2 times
                     speedup in Verify operations.},
  bibdate         = {Sat Aug 19 07:37:30 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{He:2023:FIC,
  author          = {Pengzhou He and Tianyou Bao and Jiafeng Xie and
                     Moeness Amin},
  title           = {{FPGA} Implementation of Compact Hardware Accelerators
                     for Ring-Binary-{LWE}-based Post-quantum Cryptography},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  volume          = {16},
  number          = {3},
  articleno       = {45},
  pages           = {45:1--45:??},
  month           = sep,
  year            = {2023},
  DOI             = {https://doi.org/10.1145/3569457},
  URL             = {https://dl.acm.org/doi/10.1145/3569457},
  journal-URL     = {https://dl.acm.org/loi/trets},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  CODEN           = {????},
  abstract        = {Post-quantum cryptography (PQC) has recently drawn
                     substantial attention from various communities owing to
                     the proven vulnerability of existing public-key
                     cryptosystems against the attacks launched from
                     well-established quantum computers. The
                     Ring-Binary-Learning-with-Errors (RBLWE), a variant of
                     Ring-LWE, has been proposed to build PQC for
                     lightweight applications. As more Field-Programmable
                     Gate Array (FPGA) devices are being deployed in
                     lightweight applications like Internet-of-Things (IoT)
                     devices, it would be interesting if the RBLWE-based PQC
                     can be implemented on the FPGA with ultra-low
                     complexity and flexible processing. However, thus far,
                     limited information is available for such
                     implementations. In this article, we propose novel
                     RBLWE-based PQC accelerators on the FPGA with ultra-low
                     implementation complexity and flexible timing. We first
                     present the process of deriving the key operation of
                     the RBLWE-based scheme into the proposed algorithmic
                     operation. The corresponding hardware accelerator is
                     then efficiently mapped from the proposed algorithm
                     with the help of algorithm-to-architecture
                     implementation techniques and extended to obtain
                     higher-throughput designs. The final complexity
                     analysis and implementation results (on a variety of
                     FPGAs) show that the proposed accelerators have
                     significantly smaller area-time complexities than the
                     state-of-the-art designs. Overall, the proposed
                     accelerators feature low implementation complexity and
                     flexible processing, making them desirable for emerging
                     FPGA-based lightweight applications.},
  bibdate         = {Sat Aug 19 07:37:30 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Jun:2023:ASD,
  author          = {Hyegang Jun and Hanchen Ye and Hyunmin Jeong and
                     Deming Chen},
  title           = {{AutoScaleDSE}: a Scalable Design Space Exploration
                     Engine for High-Level Synthesis},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  volume          = {16},
  number          = {3},
  articleno       = {46},
  pages           = {46:1--46:??},
  month           = sep,
  year            = {2023},
  DOI             = {https://doi.org/10.1145/3572959},
  URL             = {https://dl.acm.org/doi/10.1145/3572959},
  journal-URL     = {https://dl.acm.org/loi/trets},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  CODEN           = {????},
  abstract        = {High-Level Synthesis (HLS) has enabled users to
                     rapidly develop designs targeted for FPGAs from the
                     behavioral description of the design. However, to
                     synthesize an optimal design capable of taking better
                     advantage of the target FPGA, a considerable amount
                     \ldots{}},
  bibdate         = {Sat Aug 19 07:37:30 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Chang:2023:AHC,
  author          = {Liang Chang and Xin Zhao and Jun Zhou},
  title           = {{ADAS}: a High Computational Utilization Dynamic
                     Reconfigurable Hardware Accelerator for Super
                     Resolution},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  volume          = {16},
  number          = {3},
  articleno       = {47},
  pages           = {47:1--47:??},
  month           = sep,
  year            = {2023},
  DOI             = {https://doi.org/10.1145/3570927},
  URL             = {https://dl.acm.org/doi/10.1145/3570927},
  journal-URL     = {https://dl.acm.org/loi/trets},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  CODEN           = {????},
  abstract        = {Super-resolution (SR) based on deep learning has
                     obtained superior performance in image reconstruction.
                     Recently, various algorithm efforts have been committed
                     to improving image reconstruction quality and speed.
                     However, the inference of SR contains huge \ldots{}},
  bibdate         = {Sat Aug 19 07:37:30 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Skubich:2023:IRT,
  author          = {Christian Skubich and Peter Reichel and Marc
                     Reichenbach},
  title           = {Increasing the Robustness of {TERO-TRNGs} Against
                     Process Variation},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  volume          = {16},
  number          = {3},
  articleno       = {48},
  pages           = {48:1--48:??},
  month           = sep,
  year            = {2023},
  DOI             = {https://doi.org/10.1145/3597418},
  URL             = {https://dl.acm.org/doi/10.1145/3597418},
  journal-URL     = {https://dl.acm.org/loi/trets},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  CODEN           = {????},
  abstract        = {The transition effect ring oscillator is a popular
                     design for building entropy sources because it is
                     compact, built from digital elements only, and is very
                     well suited for FPGAs. However, it is known to be quite
                     sensitive to process variation. Although \ldots{}},
  bibdate         = {Sat Aug 19 07:37:30 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Fiege:2023:BBS,
  author          = {Nicolai Fiege and Peter Zipf},
  title           = {{BLOOP}: {Boolean} Satisfiability-based Optimized Loop
                     Pipelining},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  volume          = {16},
  number          = {3},
  articleno       = {49},
  pages           = {49:1--49:??},
  month           = sep,
  year            = {2023},
  DOI             = {https://doi.org/10.1145/3599972},
  URL             = {https://dl.acm.org/doi/10.1145/3599972},
  journal-URL     = {https://dl.acm.org/loi/trets},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  CODEN           = {????},
  abstract        = {Modulo scheduling is the premier technique for
                     throughput maximization of loops in high-level
                     synthesis by interleaving consecutive loop iterations.
                     The number of clock cycles between data insertions is
                     called the initiation interval (II). For throughput
                     \ldots{}},
  bibdate         = {Sat Aug 19 07:37:30 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Arora:2023:CDC,
  author          = {Aman Arora and Atharva Bhamburkar and Aatman Borda and
                     Tanmay Anand and Rishabh Sehgal and Bagus Hanindhito
                     and Pierre-Emmanuel Gaillardon and Jaydeep Kulkarni and
                     Lizy K. John},
  title           = {{CoMeFa}: Deploying Compute-in-Memory on {FPGAs} for
                     Deep Learning Acceleration},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  volume          = {16},
  number          = {3},
  articleno       = {50},
  pages           = {50:1--50:??},
  month           = sep,
  year            = {2023},
  DOI             = {https://doi.org/10.1145/3603504},
  URL             = {https://dl.acm.org/doi/10.1145/3603504},
  journal-URL     = {https://dl.acm.org/loi/trets},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  CODEN           = {????},
  abstract        = {Block random access memories (BRAMs) are the storage
                     houses of FPGAs, providing extensive on-chip memory
                     bandwidth to the compute units implemented using logic
                     blocks and digital signal processing slices. We propose
                     modifying BRAMs to convert them to \ldots{}},
  bibdate         = {Sat Aug 19 07:37:30 MDT 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Li:2023:ISS,
  author          = {Jing Li and Martin Herbordt},
  title           = {Introduction to the Special Section on {FCCM 2022}},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  volume          = {16},
  number          = {4},
  articleno       = {51},
  pages           = {51:1--51:??},
  month           = dec,
  year            = {2023},
  DOI             = {https://doi.org/10.1145/3632092},
  URL             = {https://dl.acm.org/doi/10.1145/3632092},
  journal-URL     = {https://dl.acm.org/loi/trets},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  CODEN           = {????},
  bibdate         = {Fri Dec 22 06:11:49 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Wu:2023:TEA,
  author          = {Guiming Wu and Qianwen He and Jiali Jiang and
                     Zhenxiang Zhang and Yuan Zhao and Yinchao Zou and Jie
                     Zhang and Changzheng Wei and Ying Yan and Hui Zhang},
  title           = {{Topgun}: an {ECC} Accelerator for Private Set
                     Intersection},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  volume          = {16},
  number          = {4},
  articleno       = {52},
  pages           = {52:1--52:??},
  month           = dec,
  year            = {2023},
  DOI             = {https://doi.org/10.1145/3603114},
  URL             = {https://dl.acm.org/doi/10.1145/3603114},
  journal-URL     = {https://dl.acm.org/loi/trets},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  CODEN           = {????},
  abstract        = {Elliptic Curve Cryptography (ECC), one of the most
                     widely used asymmetric cryptographic algorithms, has
                     been deployed in Transport Layer Security (TLS)
                     protocol, blockchain, secure multiparty computation,
                     and so on. As one of the most secure ECC curves,
                     \ldots{}},
  bibdate         = {Fri Dec 22 06:11:49 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
                     https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Xu:2023:FAG,
  author          = {Tiancheng Xu and Scott Rixner and Alan L. Cox},
  title           = {An {FPGA} Accelerator for Genome Variant Calling},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  volume          = {16},
  number          = {4},
  articleno       = {53},
  pages           = {53:1--53:??},
  month           = dec,
  year            = {2023},
  DOI             = {https://doi.org/10.1145/3595297},
  URL             = {https://dl.acm.org/doi/10.1145/3595297},
  journal-URL     = {https://dl.acm.org/loi/trets},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  CODEN           = {????},
  abstract        = {In genome analysis, it is often important to identify
                     variants from a reference genome. However, identifying
                     variants that occur with low frequency can be
                     challenging, as it is computationally intensive to do
                     so accurately. LoFreq is a widely used program
                     \ldots{}},
  bibdate         = {Fri Dec 22 06:11:49 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Josipovic:2023:RSD,
  author          = {Lana Josipovi{\'c} and Axel Marmet and Andrea
                     Guerrieri and Paolo Ienne},
  title           = {Resource Sharing in Dataflow Circuits},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  volume          = {16},
  number          = {4},
  articleno       = {54},
  pages           = {54:1--54:??},
  month           = dec,
  year            = {2023},
  DOI             = {https://doi.org/10.1145/3597614},
  URL             = {https://dl.acm.org/doi/10.1145/3597614},
  journal-URL     = {https://dl.acm.org/loi/trets},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  CODEN           = {????},
  abstract        = {To achieve resource-efficient hardware designs,
                     high-level synthesis (HLS) tools share (i.e.,
                     time-multiplex) functional units among operations of
                     the same type. This optimization is typically performed
                     in conjunction with operation scheduling to ensure
                     \ldots{}},
  bibdate         = {Fri Dec 22 06:11:49 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Cheng:2023:PCF,
  author          = {Jianyi Cheng and Lana Josipovi{\'c} and John Wickerson
                     and George A. Constantinides},
  title           = {Parallelising Control Flow in Dynamic-scheduling
                     High-level Synthesis},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  volume          = {16},
  number          = {4},
  articleno       = {55},
  pages           = {55:1--55:??},
  month           = dec,
  year            = {2023},
  DOI             = {https://doi.org/10.1145/3599973},
  URL             = {https://dl.acm.org/doi/10.1145/3599973},
  journal-URL     = {https://dl.acm.org/loi/trets},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  CODEN           = {????},
  abstract        = {Recently, there is a trend to use high-level synthesis
                     (HLS) tools to generate dynamically scheduled hardware.
                     The generated hardware is made up of components
                     connected using handshake signals. These handshake
                     signals schedule the components at runtime \ldots{}},
  bibdate         = {Fri Dec 22 06:11:49 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Ienne:2023:ISS,
  author          = {Paolo Ienne},
  title           = {Introduction to the Special Section on {FPGA 2022}},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  volume          = {16},
  number          = {4},
  articleno       = {56},
  pages           = {56:1--56:??},
  month           = dec,
  year            = {2023},
  DOI             = {https://doi.org/10.1145/3618114},
  URL             = {https://dl.acm.org/doi/10.1145/3618114},
  journal-URL     = {https://dl.acm.org/loi/trets},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  CODEN           = {????},
  bibdate         = {Fri Dec 22 06:11:49 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Wang:2023:LSL,
  author          = {Erwei Wang and Marie Auffret and Georgios-Ilias
                     Stavrou and Peter Y. K. Cheung and George A.
                     Constantinides and Mohamed S. Abdelfattah and James J.
                     Davis},
  title           = {Logic Shrinkage: Learned Connectivity Sparsification
                     for {LUT}-Based Neural Networks},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  volume          = {16},
  number          = {4},
  articleno       = {57},
  pages           = {57:1--57:??},
  month           = dec,
  year            = {2023},
  DOI             = {https://doi.org/10.1145/3583075},
  URL             = {https://dl.acm.org/doi/10.1145/3583075},
  journal-URL     = {https://dl.acm.org/loi/trets},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  CODEN           = {????},
  abstract        = {Field-programmable gate array (FPGA)-specific deep
                     neural network (DNN) architectures using native lookup
                     tables (LUTs) as independently trainable inference
                     operators have been shown to achieve favorable
                     area-accuracy and energy-accuracy trade-offs. The
                     \ldots{}},
  bibdate         = {Fri Dec 22 06:11:49 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Gao:2023:RAR,
  author          = {Yizhao Gao and Song Wang and Hayden Kwok-Hay So},
  title           = {A Reconfigurable Architecture for Real-time
                     Event-based Multi-Object Tracking},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  volume          = {16},
  number          = {4},
  articleno       = {58},
  pages           = {58:1--58:??},
  month           = dec,
  year            = {2023},
  DOI             = {https://doi.org/10.1145/3593587},
  URL             = {https://dl.acm.org/doi/10.1145/3593587},
  journal-URL     = {https://dl.acm.org/loi/trets},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  CODEN           = {????},
  abstract        = {Although advances in event-based machine vision
                     algorithms have demonstrated unparalleled capabilities
                     in performing some of the most demanding tasks, their
                     implementations under stringent real-time and power
                     constraints in edge systems remain a major \ldots{}},
  bibdate         = {Fri Dec 22 06:11:49 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Guo:2023:RAP,
  author          = {Licheng Guo and Pongstorn Maidee and Yun Zhou and
                     Chris Lavin and Eddie Hung and Wuxi Li and Jason Lau
                     and Weikang Qiao and Yuze Chi and Linghao Song and
                     Yuanlong Xiao and Alireza Kaviani and Zhiru Zhang and
                     Jason Cong},
  title           = {{RapidStream 2.0}: Automated Parallel Implementation
                     of Latency-Insensitive {FPGA} Designs Through Partial
                     Reconfiguration},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  volume          = {16},
  number          = {4},
  articleno       = {59},
  pages           = {59:1--59:??},
  month           = dec,
  year            = {2023},
  DOI             = {https://doi.org/10.1145/3593025},
  URL             = {https://dl.acm.org/doi/10.1145/3593025},
  journal-URL     = {https://dl.acm.org/loi/trets},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  CODEN           = {????},
  abstract        = {Field-programmable gate arrays (FPGAs) require a much
                     longer compilation cycle than conventional computing
                     platforms such as CPUs. In this article, we shorten the
                     overall compilation time by co-optimizing the HLS
                     compilation (C-to-RTL) and the back-end \ldots{}},
  bibdate         = {Fri Dec 22 06:11:49 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Nechi:2023:FBD,
  author          = {Anouar Nechi and Lukas Groth and Saleh Mulhem and
                     Farhad Merchant and Rainer Buchty and Mladen
                     Berekovic},
  title           = {{FPGA}-based Deep Learning Inference Accelerators:
                     Where Are We Standing?},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  volume          = {16},
  number          = {4},
  articleno       = {60},
  pages           = {60:1--60:??},
  month           = dec,
  year            = {2023},
  DOI             = {https://doi.org/10.1145/3613963},
  URL             = {https://dl.acm.org/doi/10.1145/3613963},
  journal-URL     = {https://dl.acm.org/loi/trets},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  CODEN           = {????},
  abstract        = {Recently, artificial intelligence applications have
                     become part of almost all emerging technologies around
                     us. Neural networks, in particular, have shown
                     significant advantages and have been widely adopted
                     over other approaches in machine learning. In
                     \ldots{}},
  bibdate         = {Fri Dec 22 06:11:49 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@article{Leipnitz:2023:CAM,
  author          = {Marcos T. Leipnitz and Gabriel L. Nazar},
  title           = {Constraint-Aware Multi-Technique Approximate
                     High-Level Synthesis for {FPGAs}},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  volume          = {16},
  number          = {4},
  articleno       = {61},
  pages           = {61:1--61:??},
  month           = dec,
  year            = {2023},
  DOI             = {https://doi.org/10.1145/3624481},
  URL             = {https://dl.acm.org/doi/10.1145/3624481},
  journal-URL     = {https://dl.acm.org/loi/trets},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  CODEN           = {????},
  abstract        = {Numerous approximate computing (AC) techniques have
                     been developed to reduce the design costs in
                     error-resilient application domains, such as signal and
                     multimedia processing, data mining, machine learning,
                     and computer vision, to trade-off computation
                     \ldots{}},
  bibdate         = {Fri Dec 22 06:11:49 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Liu:2023:CKC,
  author =       "Kenneth Liu and Alec Lu and Kartik Samtani and Zhenman
                 Fang and Licheng Guo",
  title =        "{CHIP-KNNv2}: a Configurable and High-Performance
                 {$K$}-Nearest Neighbors Accelerator on {HBM}-based
                 {FPGAs}",
  journal =      j-TRETS,
  volume =       "16",
  number =       "4",
  pages =        "62:1--62:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3616873",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Dec 22 06:11:49 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3616873",
  abstract =     "The $k$-nearest neighbors (KNN) algorithm is an
                 essential algorithm in many applications, such as
                 similarity search, image classification, and database
                 query. With the rapid growth in the dataset size and
                 the feature dimension of each data point, processing
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "62",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@article{Guo:2023:TST,
  author          = {Licheng Guo and Yuze Chi and Jason Lau and Linghao
                     Song and Xingyu Tian and Moazin Khatti and Weikang Qiao
                     and Jie Wang and Ecenur Ustun and Zhenman Fang and
                     Zhiru Zhang and Jason Cong},
  title           = {{TAPA}: a Scalable Task-parallel Dataflow Programming
                     Framework for Modern {FPGAs} with Co-optimization of
                     {HLS} and Physical Design},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  volume          = {16},
  number          = {4},
  articleno       = {63},
  pages           = {63:1--63:??},
  month           = dec,
  year            = {2023},
  DOI             = {https://doi.org/10.1145/3609335},
  URL             = {https://dl.acm.org/doi/10.1145/3609335},
  journal-URL     = {https://dl.acm.org/loi/trets},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  CODEN           = {????},
  abstract        = {In this article, we propose TAPA, an end-to-end
                     framework that compiles a C++ task-parallel dataflow
                     program into a high-frequency FPGA accelerator.
                     Compared to existing solutions, TAPA has two major
                     advantages. First, TAPA provides a set of convenient
                     \ldots{}},
  bibdate         = {Fri Dec 22 06:11:49 MST 2023},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

%%% NOTE(review): the sixth author was recorded as "Huang Zhengfeng"
%%% (family name first), unlike every other author in this entry, which
%%% made BibTeX treat Zhengfeng as the surname.  Reordered to given-family
%%% form so that Huang is parsed as the surname --- confirm against the
%%% article byline.
@Article{Lu:2023:HET,
  author =       "Yingchun Lu and Yun Yang and Rong Hu and Huaguo Liang
                 and Maoxiang Yi and Zhengfeng Huang and Yuanming Ma and
                 Tian Chen and Liang Yao",
  title =        "High-efficiency {TRNG} Design Based on Multi-bit
                 Dual-ring Oscillator",
  journal =      j-TRETS,
  volume =       "16",
  number =       "4",
  pages =        "64:1--64:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3624991",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Dec 22 06:11:49 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/prng.bib;
                 https://www.math.utah.edu/pub/tex/bib/trets.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3624991",
  abstract =     "Unpredictable true random numbers are required in
                 security technology fields such as information
                 encryption, key generation, mask generation for
                 anti-side-channel analysis, algorithm initialization,
                 and so on. At present, the true random number generator
                 (TRNG) is not enough to provide fast random bits by
                 low-speed bits generation. Therefore, it is necessary
                 to design a faster TRNG. This work presents an
                 ultra-compact TRNG with high throughput based on a
                 novel extendable dual-ring oscillator (DRO). Owing to
                 multiple bits output per cycle in DRO can be used to
                 obtain the original random sequence, the proposed DRO
                 achieves a maximum resource utilization to build a more
                 efficient TRNG, compared with the conventional TRNG
                 system based on ring oscillator (RO), which only has a
                 single output and needs to build multiple groups of
                 ring oscillators. TRNG based on the 2-bit DRO and its
                 8-bit derivative structure has been verified on Xilinx
                 Artix-7 and Kintex-7 FPGA under the automatic layout
                 and routing and has achieved a throughput of 550 Mbps
                 and 1,100 Mbps, respectively. Moreover, in terms of
                 throughput performance over operating frequency,
                 hardware consumption, and entropy, the proposed scheme
                 has obvious advantages. Finally, the generated
                 sequences show good randomness in the test of NIST
                 SP800-22 and Dieharder test suite and pass the entropy
                 estimation test kit NIST SP800-90B and AIS-31.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Reconfigurable Technol. Syst.",
  articleno =    "64",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "https://dl.acm.org/loi/trets",
}

@Article{Anupreetham:2024:HTF,
  author          = {Anupreetham Anupreetham and Mohamed Ibrahim and Mathew
                     Hall and Andrew Boutros and Ajay Kuzhively and Abinash
                     Mohanty and Eriko Nurvitadhi and Vaughn Betz and Yu Cao
                     and Jae-Sun Seo},
  title           = {High Throughput {FPGA}-Based Object Detection via
                     Algorithm-Hardware Co-Design},
  journal         = j-TRETS,
  volume          = {17},
  number          = {1},
  pages           = {1:1--1:??},
  month           = mar,
  year            = {2024},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3634919},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Wed Mar 20 07:25:09 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3634919},
  abstract        = {Object detection and classification is a key task in
                     many computer vision applications such as smart
                     surveillance and autonomous vehicles. Recent advances
                     in deep learning have significantly improved the
                     quality of results achieved by these systems,
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {1},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Fan:2024:HDF,
  author          = {Zimeng Fan and Wei Hu and Fang Liu and Dian Xu and
                     Hong Guo and Yanxiang He and Min Peng},
  title           = {A Hardware Design Framework for Computer Vision Models
                     Based on Reconfigurable Devices},
  journal         = j-TRETS,
  volume          = {17},
  number          = {1},
  pages           = {2:1--2:??},
  month           = mar,
  year            = {2024},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3635157},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Wed Mar 20 07:25:09 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3635157},
  abstract        = {In computer vision, the joint development of the
                     algorithm and computing dimensions cannot be separated.
                     Models and algorithms are constantly evolving, while
                     hardware designs must adapt to new or updated
                     algorithms. Reconfigurable devices are recognized
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {2},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Maschi:2024:SHS,
  author          = {Fabio Maschi and Gustavo Alonso},
  title           = {{Strega}: an {HTTP} Server for {FPGAs}},
  journal         = j-TRETS,
  volume          = {17},
  number          = {1},
  pages           = {3:1--3:??},
  month           = mar,
  year            = {2024},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3611312},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Wed Mar 20 07:25:09 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3611312},
  abstract        = {The computer architecture landscape is being reshaped
                     by the new opportunities, challenges, and constraints
                     brought by the cloud. On the one hand, high-level
                     applications profit from specialised hardware to boost
                     their performance and reduce deployment \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {3},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Qiu:2024:FFD,
  author          = {Yunhui Qiu and Yiqing Mao and Xuchen Gao and Sichao
                     Chen and Jiangnan Li and Wenbo Yin and Lingli Wang},
  title           = {{FDRA}: a Framework for a Dynamically Reconfigurable
                     Accelerator Supporting Multi-Level Parallelism},
  journal         = j-TRETS,
  volume          = {17},
  number          = {1},
  pages           = {4:1--4:??},
  month           = mar,
  year            = {2024},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3614224},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Wed Mar 20 07:25:09 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3614224},
  abstract        = {Coarse-grained reconfigurable architectures (CGRAs)
                     have emerged as promising accelerators due to their
                     high flexibility and energy efficiency. However,
                     existing open source works often lack integration of
                     CGRAs with CPU systems and corresponding \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {4},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Kalomiros:2024:HAS,
  author          = {John Kalomiros and John Vourvoulakis and Stavros
                     Vologiannidis},
  title           = {A Hardware Accelerator for the Semi-Global Matching
                     Stereo Algorithm: an Efficient Implementation for the
                     {Stratix V} and {Zynq UltraScale+} {FPGA} Technology},
  journal         = j-TRETS,
  volume          = {17},
  number          = {1},
  pages           = {5:1--5:??},
  month           = mar,
  year            = {2024},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3615869},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Wed Mar 20 07:25:09 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3615869},
  abstract        = {The semi-global matching stereo algorithm is a top
                     performing algorithm in stereo vision. The recursive
                     nature of the computations involved in this algorithm
                     introduces an inherent data dependency problem,
                     hindering the progressive computations of \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {5},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Reis:2024:DDL,
  author          = {Miguel Reis and M{\'a}rio V{\'e}stias and Hor{\'a}cio
                     Neto},
  title           = {Designing Deep Learning Models on {FPGA} with Multiple
                     Heterogeneous Engines},
  journal         = j-TRETS,
  volume          = {17},
  number          = {1},
  pages           = {6:1--6:??},
  month           = mar,
  year            = {2024},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3615870},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Wed Mar 20 07:25:09 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3615870},
  abstract        = {Deep learning models are becoming more complex and
                     heterogeneous with new layer types to improve their
                     accuracy. This brings a considerable challenge to the
                     designers of accelerators of deep neural networks.
                     There have been several architectures and \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {6},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{FaoDeMoura:2024:RNL,
  author          = {Rafael {F{\~a}o De Moura} and Luigi Carro},
  title           = {Reprogrammable Non-Linear Circuits Using {ReRAM} for
                     {NN} Accelerators},
  journal         = j-TRETS,
  volume          = {17},
  number          = {1},
  pages           = {7:1--7:??},
  month           = mar,
  year            = {2024},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3617894},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Wed Mar 20 07:25:09 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3617894},
  abstract        = {As the massive usage of artificial intelligence
                     techniques spreads in the economy, researchers are
                     exploring new techniques to reduce the energy
                     consumption of Neural Network (NN) applications,
                     especially as the complexity of NNs continues to
                     increase. \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {7},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Honorat:2024:ABS,
  author          = {Alexandre Honorat and Micka{\"e}l Dardaillon and Hugo
                     Miomandre and Jean-Fran{\c{c}}ois Nezan},
  title           = {Automated Buffer Sizing of Dataflow Applications in a
                     High-level Synthesis Workflow},
  journal         = j-TRETS,
  volume          = {17},
  number          = {1},
  pages           = {8:1--8:??},
  month           = mar,
  year            = {2024},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3626103},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Wed Mar 20 07:25:09 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3626103},
  abstract        = {High-Level Synthesis (HLS) tools are mature enough to
                     provide efficient code generation for computation
                     kernels on FPGA hardware. For more complex
                     applications, multiple kernels may be connected by a
                     dataflow graph. Although some tools, such as Xilinx
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {8},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Noyez:2024:MMS,
  author          = {Louis Noyez and Nadia {El Mrabet} and Olivier Potin
                     and Pascal Veron},
  title           = {{Montgomery} Multiplication Scalable Systolic Designs
                     Optimized for {DSP48E2}},
  journal         = j-TRETS,
  volume          = {17},
  number          = {1},
  pages           = {9:1--9:??},
  month           = mar,
  year            = {2024},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3624571},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Wed Mar 20 07:25:09 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3624571},
  abstract        = {This article describes an extensive study of the use
                     of DSP48E2 Slices in Ultrascale FPGAs to design
                     hardware versions of the Montgomery Multiplication
                     algorithm for the hardware acceleration of modular
                     multiplications. Our fully scalable systolic \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {9},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Soleimani:2024:PCA,
  author          = {Parastoo Soleimani and David W. Capson and Kin Fun
                     Li},
  title           = {A Partitioned {CAM} Architecture with {FPGA}
                     Acceleration for Binary Descriptor Matching},
  journal         = j-TRETS,
  volume          = {17},
  number          = {1},
  pages           = {10:1--10:??},
  month           = mar,
  year            = {2024},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3624749},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Wed Mar 20 07:25:09 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3624749},
  abstract        = {An efficient architecture for image descriptor
                     matching that uses a partitioned content-addressable
                     memory (CAM)-based approach is proposed. CAM is
                     frequently used in high-speed content-matching
                     applications. However, due to its lack of functionality
                     to \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {10},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Weng:2024:TAS,
  author          = {Olivia Weng and Gabriel Marcano and Vladimir Loncar
                     and Alireza Khodamoradi and Abarajithan G. and Nojan
                     Sheybani and Andres Meza and Farinaz Koushanfar and
                     Kristof Denolf and Javier Mauricio Duarte and Ryan
                     Kastner},
  title           = {{Tailor}: Altering Skip Connections for
                     Resource-Efficient Inference},
  journal         = j-TRETS,
  volume          = {17},
  number          = {1},
  pages           = {11:1--11:??},
  month           = mar,
  year            = {2024},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3624990},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Wed Mar 20 07:25:09 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3624990},
  abstract        = {Deep neural networks use skip connections to improve
                     training convergence. However, these skip connections
                     are costly in hardware, requiring extra buffers and
                     increasing on- and off-chip memory utilization and
                     bandwidth requirements. In this article, we \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {11},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Hasler:2024:PAS,
  author          = {Jennifer Hasler and Cong Hao},
  title           = {Programmable Analog System Benchmarks Leading to
                     Efficient Analog Computation Synthesis},
  journal         = j-TRETS,
  volume          = {17},
  number          = {1},
  pages           = {12:1--12:??},
  month           = mar,
  year            = {2024},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3625298},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Wed Mar 20 07:25:09 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3625298},
  abstract        = {This effort develops the first rich suite of analog
                     and mixed-signal benchmark of various sizes and
                     domains, intended for use with contemporary analog and
                     mixed-signal designs and synthesis tools. Benchmarking
                     enables analog-digital co-design exploration \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {12},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Gohringer:2024:IFS,
  author          = {Diana G{\"o}hringer and Georgios Keramidas and Akash
                     Kumar},
  title           = {Introduction to the {FPL 2021} Special Section},
  journal         = j-TRETS,
  volume          = {17},
  number          = {1},
  pages           = {13:1--13:??},
  month           = mar,
  year            = {2024},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3635115},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Wed Mar 20 07:25:09 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3635115},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {13},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Nikolic:2024:EFS,
  author          = {Stefan Nikoli{\'c} and Paolo Ienne},
  title           = {Exploring {FPGA} Switch-Blocks without Explicitly
                     Listing Connectivity Patterns},
  journal         = j-TRETS,
  volume          = {17},
  number          = {1},
  pages           = {14:1--14:??},
  month           = mar,
  year            = {2024},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3597417},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Wed Mar 20 07:25:09 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3597417},
  abstract        = {Increased lower metal resistance makes physical
                     aspects of Field-Programmable Gate Array (FPGA)
                     switch-blocks more relevant than before. The need to
                     navigate a design space where each individual switch
                     can have significant impact on the FPGA's performance
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {14},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Liu:2024:EFB,
  author          = {Zhengyan Liu and Qiang Liu and Shun Yan and Ray C. C.
                     Cheung},
  title           = {An Efficient {FPGA}-based Depthwise Separable
                     Convolutional Neural Network Accelerator with Hardware
                     Pruning},
  journal         = j-TRETS,
  volume          = {17},
  number          = {1},
  pages           = {15:1--15:??},
  month           = mar,
  year            = {2024},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3615661},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Wed Mar 20 07:25:09 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3615661},
  abstract        = {Convolutional neural networks (CNNs) have been widely
                     deployed in computer vision tasks. However, the
                     computation and resource intensive characteristics of
                     CNN bring obstacles to its application on embedded
                     systems. This article proposes an efficient \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {15},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Chen:2024:EVL,
  author          = {Jeffrey Chen and Sang-Woo Jun and Sehwan Hong and
                     Warrick He and Jinyeong Moon},
  title           = {{Eciton}: Very Low-power Recurrent Neural Network
                     Accelerator for Real-time Inference at the Edge},
  journal         = j-TRETS,
  volume          = {17},
  number          = {1},
  pages           = {16:1--16:??},
  month           = mar,
  year            = {2024},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3629979},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Wed Mar 20 07:25:09 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3629979},
  abstract        = {This article presents Eciton, a very low-power
                     recurrent neural network accelerator for time series
                     data within low-power edge sensor nodes, achieving
                     real-time inference with a power consumption of 17 mW
                     under load. Eciton reduces memory and chip \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {16},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Sani:2024:EIU,
  author          = {Sajjad Rostami Sani and Andy Ye},
  title           = {Evaluating the Impact of Using Multiple-Metal Layers
                     on the Layout Area of Switch Blocks for Tile-Based
                     {FPGAs} in {FinFET} 7nm},
  journal         = j-TRETS,
  volume          = {17},
  number          = {1},
  pages           = {17:1--17:??},
  month           = mar,
  year            = {2024},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3639055},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Wed Mar 20 07:25:09 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3639055},
  abstract        = {A new area model for estimating the layout area of
                     switch blocks is introduced in this work. The model is
                     based on a realistic layout strategy. As a result, it
                     not only takes into consideration the active area that
                     is needed to construct a switch block \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {17},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}

@Article{Li:2024:ADC,
  author          = {Yonggen Li and Xin Li and Haibin Shen and Jicong Fan
                     and Yanfeng Xu and Kejie Huang},
  title           = {An All-digital Compute-in-memory {FPGA} Architecture
                     for Deep Learning Acceleration},
  journal         = j-TRETS,
  volume          = {17},
  number          = {1},
  pages           = {18:1--18:??},
  month           = mar,
  year            = {2024},
  CODEN           = {????},
  DOI             = {https://doi.org/10.1145/3640469},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Wed Mar 20 07:25:09 MDT 2024},
  bibsource       = {https://www.math.utah.edu/pub/tex/bib/trets.bib},
  URL             = {https://dl.acm.org/doi/10.1145/3640469},
  abstract        = {Field Programmable Gate Array (FPGA) is a versatile
                     and programmable hardware platform, which makes it a
                     promising candidate for accelerating Deep Neural
                     Networks (DNNs). However, FPGA's computing energy
                     efficiency is low due to the domination of energy
                     \ldots{}},
  acknowledgement = ack-nhfb,
  ajournal        = {ACM Trans. Reconfigurable Technol. Syst.},
  articleno       = {18},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {https://dl.acm.org/loi/trets},
}