%%% -*-BibTeX-*-
%%% ====================================================================
%%%  BibTeX-file{
%%%     author          = "Nelson H. F. Beebe",
%%%     version         = "1.07",
%%%     date            = "02 June 2023",
%%%     time            = "12:34:38 MDT",
%%%     filename        = "risc-v.bib",
%%%     address         = "University of Utah
%%%                        Department of Mathematics, 110 LCB
%%%                        155 S 1400 E RM 233
%%%                        Salt Lake City, UT 84112-0090
%%%                        USA",
%%%     telephone       = "+1 801 581 5254",
%%%     FAX             = "+1 801 581 4148",
%%%     URL             = "http://www.math.utah.edu/~beebe",
%%%     checksum        = "53902 2588 13088 132345",
%%%     email           = "beebe at math.utah.edu, beebe at acm.org,
%%%                        beebe at computer.org (Internet)",
%%%     codetable       = "ISO/ASCII",
%%%     keywords        = "bibliography; BibTeX; open-source instruction
%%%                        set architecture; RISC-V; RISC-V32; RISC-V64",
%%%     license         = "public domain",
%%%     supported       = "yes",
%%%     docstring       = "This is a bibliography of publications about
%%%                        the RISC-V processor architecture, a recent
%%%                        instruction set design that is open source,
%%%                        and intended to receive wide use from small
%%%                        embedded systems, to desktop computers, and
%%%                        to high-performance computing servers.
%%%
%%%                        At version 1.07, the year coverage looked
%%%                        like this:
%%%
%%%                             2016 (   3)    2019 (  10)    2022 (  11)
%%%                             2017 (  11)    2020 (   7)    2023 (   8)
%%%                             2018 (   2)    2021 (   8)
%%%
%%%                             Article:         48
%%%                             Book:             1
%%%                             InProceedings:    3
%%%                             Misc:             5
%%%                             Proceedings:      3
%%%
%%%                             Total entries:   60
%%%
%%%                        The checksum field above contains a CRC-16
%%%                        checksum as the first value, followed by the
%%%                        equivalent of the standard UNIX wc (word
%%%                        count) utility output of lines, words, and
%%%                        characters.  This is produced by Robert
%%%                        Solovay's checksum utility.",
%%%  }
%%% ====================================================================
%%% Fallback definition of \pkg (argument set in typewriter type); it is
%%% installed only when the citing document has not defined \pkg itself.
@preamble{"\ifx \undefined \pkg       \def \pkg      #1{{{\tt #1}}} \fi"}

%%% ====================================================================
%%% Acknowledgement abbreviations:
%%% Maintainer contact details, referenced by the acknowledgement field
%%% of the entries in this file.
@string{ack-nhfb =
    {Nelson H. F. Beebe,
     University of Utah,
     Department of Mathematics, 110 LCB,
     155 S 1400 E RM 233,
     Salt Lake City, UT 84112-0090, USA,
     Tel: +1 801 581 5254,
     FAX: +1 801 581 4148,
     e-mail: \path|beebe@math.utah.edu|,
     \path|beebe@acm.org|,
     \path|beebe@computer.org| (Internet),
     URL: \path|http://www.math.utah.edu/~beebe/|}}

%%% ====================================================================
%%% Journal abbreviations:
%%% One macro per journal; entries refer to these names in their journal
%%% field so that a title can be changed in a single place.
@string{j-CACM =
    {Communications of the Association for Computing Machinery}}
@string{j-COMP-ARCH-NEWS =
    {ACM SIGARCH Computer Architecture News}}
@string{j-DTRAP =
    {Digital Threats: Research and Practice (DTRAP)}}
@string{j-FUT-GEN-COMP-SYS =
    {Future Generation Computer Systems}}
@string{j-IEEE-COMPUT-ARCHIT-LETT =
    {IEEE Computer Architecture Letters}}
@string{j-IEEE-MICRO =
    {IEEE Micro}}
@string{j-IEEE-SPECTRUM =
    {IEEE Spectrum}}
@string{j-IEEE-TRANS-COMPUT =
    {IEEE Transactions on Computers}}
@string{j-J-CRYPTO-ENG =
    {Journal of Cryptographic Engineering}}
@string{j-JETC =
    {ACM Journal on Emerging Technologies in Computing Systems (JETC)}}
@string{j-OPER-SYS-REV =
    {Operating Systems Review}}
@string{j-SIGPLAN =
    {ACM SIG{\-}PLAN Notices}}
@string{j-SOFTWAREX =
    {SoftwareX}}
@string{j-TACO =
    {ACM Transactions on Architecture and Code Optimization}}
@string{j-TECS =
    {ACM Transactions on Embedded Computing Systems}}
@string{j-TODAES =
    {ACM Transactions on Design Automation of Electronic Systems}}
@string{j-TRETS =
    {ACM Transactions on Reconfigurable Technology and Systems (TRETS)}}

%%% ====================================================================
%%% Publishers and their addresses:
%%% Each publisher macro is paired with an ":adr" macro holding its
%%% address.
@string{pub-ACM =
    {ACM Press}}
@string{pub-ACM:adr =
    {New York, NY 10036, USA}}
@string{pub-IEEE =
    {IEEE Computer Society Press}}
@string{pub-IEEE:adr =
    {1109 Spring Street, Suite 300, Silver Spring, MD 20910, USA}}

%%% ====================================================================
%%% Bibliography entries, sorted by year, and then by citation label,
%%% with "bibsort --byyear":
%%% Short-Circuit Dispatch (ISCA '16 paper), as printed in ACM SIGARCH
%%% Computer Architecture News 44(3).
@Article{Kim:2016:SCD,
  author =       "Channoh Kim and Sungmin Kim and Hyeon Gyu Cho and
                 Dooyoung Kim and Jaehyeok Kim and Young H. Oh and
                 Hakbeom Jang and Jae W. Lee",
  title =        "Short-circuit dispatch: accelerating virtual machine
                 interpreters on embedded processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "44",
  number =       "3",
  pages =        "291--303",
  month =        jun,
  year =         "2016",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3007787.3001168",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Jan 12 18:43:43 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/risc-v.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigarch.bib;
                 http://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "Interpreters are widely used to implement high-level
                 language virtual machines (VMs), especially on
                 resource-constrained embedded platforms. Many scripting
                 languages employ interpreter-based VMs for their
                 advantages over native code compilers, such as
                 portability, smaller resource footprint, and compact
                 codes. For efficient interpretation a script (program)
                 is first compiled into an intermediate representation,
                 or bytecodes. The canonical interpreter then runs an
                 infinite loop that fetches, decodes, and executes one
                 bytecode at a time. This bytecode dispatch loop is a
                 well-known source of inefficiency, typically featuring
                 a large jump table with a hard-to-predict indirect
                 jump. Most existing techniques to optimize this loop
                 focus on reducing the misprediction rate of this
                 indirect jump in both hardware and software. However,
                 these techniques are much less effective on embedded
                 processors with shallow pipelines and low IPCs.
                 Instead, we tackle another source of inefficiency more
                 prominent on embedded platforms---redundant computation
                 in the dispatch loop. To this end, we propose
                 Short-Circuit Dispatch (SCD), a low-cost architectural
                 extension that enables fast, hardware-based bytecode
                 dispatch with fewer instructions. The key idea of SCD
                 is to overlay the software-created bytecode jump table
                 on a branch target buffer (BTB). Once a bytecode is
                 fetched, the BTB is looked up using the bytecode,
                 instead of PC, as key. If it hits, the interpreter
                 directly jumps to the target address retrieved from the
                 BTB; otherwise, it goes through the original dispatch
                 path. This effectively eliminates redundant computation
                 in the dispatcher code for decode, bound check, and
                 target address calculation, thus significantly reducing
                 total instruction count. Our simulation results
                 demonstrate that SCD achieves geomean speedups of
                 19.9\% and 14.1\% for two production-grade script
                 interpreters for Lua and JavaScript, respectively.
                 Moreover, our fully synthesizable RTL design based on a
                 RISC-V embedded processor shows that SCD improves the
                 EDP of the Lua interpreter by 24.2\%, while increasing
                 the chip area by only 0.72\% at a 40nm technology
                 node.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  remark =       "ISCA '16 conference proceedings.",
}

%%% IEEE Micro article on an agile approach to building RISC-V
%%% microprocessors.
@Article{Lee:2016:AAB,
  author =       "Yunsup Lee and Andrew Waterman and Henry Cook and
                 Brian Zimmer and Ben Keller and Alberto Puggelli and
                 Jaehwa Kwak and Ruzica Jevtic and Stevo Bailey and
                 Milovan Blagojevic and Pi-Feng Chiu and Rimas Avizienis
                 and Brian Richards and Jonathan Bachrach and David
                 Patterson and Elad Alon and Bora Nikolic and Krste
                 Asanovic",
  title =        "An Agile Approach to Building {RISC-V}
                 Microprocessors",
  journal =      j-IEEE-MICRO,
  volume =       "36",
  number =       "2",
  pages =        "8--20",
  month =        mar # "\slash " # apr,
  year =         "2016",
  CODEN =        "IEMIDZ",
  DOI =          "https://doi.org/10.1109/MM.2016.11",
  ISSN =         "0272-1732 (print), 1937-4143 (electronic)",
  ISSN-L =       "0272-1732",
  bibdate =      "Tue Apr 19 06:31:19 MDT 2016",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/hot-chips.bib;
                 http://www.math.utah.edu/pub/tex/bib/ieeemicro.bib;
                 http://www.math.utah.edu/pub/tex/bib/risc-v.bib",
  URL =          "http://www.computer.org/csdl/mags/mi/2016/02/mmi2016020008-abs.html",
  abstract-URL = "http://www.computer.org/csdl/mags/mi/2016/02/mmi2016020008-abs.html",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Micro",
  journal-URL =  "http://www.computer.org/csdl/mags/mi/index.html",
}

%%% Verified compiler backend for CakeML (ICFP '16 paper), as printed in
%%% ACM SIGPLAN Notices 51(9).
@article{Tan:2016:NVC,
  author          = {Yong Kiam Tan and Magnus O. Myreen and Ramana Kumar
                     and Anthony Fox and Scott Owens and Michael Norrish},
  title           = {A new verified compiler backend for {CakeML}},
  journal         = j-SIGPLAN,
  volume          = {51},
  number          = {9},
  pages           = {60--73},
  month           = sep,
  year            = {2016},
  coden           = {SINODQ},
  doi             = {https://doi.org/10.1145/3022670.2951924},
  issn            = {0362-1340 (print), 1523-2867 (print), 1558-1160
                     (electronic)},
  issn-l          = {0362-1340},
  bibdate         = {Sat Sep 16 10:18:13 MDT 2017},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/risc-v.bib;
                     http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
  abstract        = {We have developed and mechanically verified a new
                     compiler backend for CakeML. Our new compiler features
                     a sequence of intermediate languages that allows it to
                     incrementally compile away high-level features and
                     enables verification at the right levels of semantic
                     detail. In this way, it resembles mainstream
                     (unverified) compilers for strict functional languages.
                     The compiler supports efficient curried multi-argument
                     functions, configurable data representations,
                     exceptions that unwind the call stack, register
                     allocation, and more. The compiler targets several
                     architectures: x86-64, ARMv6, ARMv8, MIPS-64, and
                     RISC-V. In this paper, we present the overall structure
                     of the compiler, including its 12 intermediate
                     languages, and explain how everything fits together. We
                     focus particularly on the interaction between the
                     verification of the register allocator and the garbage
                     collector, and memory representations. The entire
                     development has been carried out within the HOL4
                     theorem prover.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGPLAN Notices},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J706},
  remark          = {ICFP '16 conference proceedings.},
}

%%% OSEK-V: RTOS instantiation in hardware (LCTES '17 paper), as printed
%%% in ACM SIGPLAN Notices 52(4).
@article{Dietrich:2017:OVA,
  author          = {Christian Dietrich and Daniel Lohmann},
  title           = {{OSEK-V}: application-specific {RTOS} instantiation in
                     hardware},
  journal         = j-SIGPLAN,
  volume          = {52},
  number          = {4},
  pages           = {111--120},
  month           = may,
  year            = {2017},
  coden           = {SINODQ},
  doi             = {https://doi.org/10.1145/3140582.3081030},
  issn            = {0362-1340 (print), 1523-2867 (print), 1558-1160
                     (electronic)},
  issn-l          = {0362-1340},
  bibdate         = {Sat Sep 16 10:18:15 MDT 2017},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/risc-v.bib;
                     http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
  abstract        = {The employment of a real-time operating system (RTOS)
                     in an embedded control systems is often an
                     all-or-nothing decision: While the RTOS-abstractions
                     provide for easier software composition and
                     development, the price in terms of event latencies and
                     memory costs are high. Especially in HW/SW codesign
                     settings, system developers try to avoid the employment
                     of a full-blown RTOS as far as possible. In OSEK-V, we
                     mitigate this trade-off by a very aggressive tailoring
                     of the concrete RTOS instance into the hardware.
                     Instead of implementing generic OS components as custom
                     hardware devices, we capture the actually possible
                     application-kernel interactions as a finite-state
                     machine and integrate the tailored RTOS semantics
                     directly into the processor pipeline. In our
                     experimental results with an OSEK-based implementation
                     of a quadrotor flight controller into the Rocket/RISC-V
                     softcore, we thereby can significantly reduce event
                     latencies, interrupt lock times, and memory footprint
                     at moderate costs in terms of FPGA resources.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGPLAN Notices},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J706},
  remark          = {LCTES '17 conference proceedings.},
}

%%% Hoplite deflection-routed torus NoC for FPGAs; ACM TRETS 10(2),
%%% article number 14 (final page number not recorded).
@article{Kapre:2017:HDR,
  author          = {Nachiket Kapre and Jan Gray},
  title           = {{Hoplite}: a Deflection-Routed Directional Torus {NoC}
                     for {FPGAs}},
  journal         = j-TRETS,
  volume          = {10},
  number          = {2},
  pages           = {14:1--14:??},
  month           = apr,
  year            = {2017},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3027486},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Sat Dec 23 10:23:01 MST 2017},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/risc-v.bib;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {We can design an FPGA-optimized lightweight
                     network-on-chip (NoC) router for flit-oriented
                     packet-switched communication that is an order of
                     magnitude smaller (in terms of LUTs and FFs) than
                     state-of-the-art FPGA overlay routers available today.
                     We present Hoplite, an efficient, lightweight, and fast
                     FPGA overlay NoC that is designed to be small and
                     compact by (1) using deflection routing instead of
                     buffered switching to eliminate expensive FIFO buffers
                     and (2) using a torus topology to reduce the cost of
                     switch crossbar. Buffering and crossbar implementation
                     complexities have traditionally limited speeds and
                     imposed heavy resource costs in conventional FPGA
                     overlay NoCs. We take care to exploit the fracturable
                     lookup tables (LUT) organization of the FPGA to further
                     improve the resource efficiency of mapping the
                     expensive crossbar multiplexers. Hoplite can outperform
                     classic, bidirectional, buffered mesh networks for
                     single-flit-oriented FPGA applications by as much as $
                     1.5 \times $ (best achievable throughputs for a $ 10
                     \times 10 $ system) or $ 2.5 \times $ (allocating same
                     amount of FPGA resources to both NoCs) for uniform
                     random traffic. When compared to buffered mesh
                     switches, FPGA-based deflection routers are $ \approx
                     3.5 \times $ smaller (HLS-generated switch) and $ 2.5
                     \times $ faster (clock period) for 32b payloads. In a
                     separate experiment, we hand-crafted an RTL version of
                     our switch with location constraints that requires only
                     60 LUTs and 100 FFs per router and runs at 2.9ns. We
                     conduct additional layout experiments on modern Xilinx
                     and Altera FPGAs and demonstrate wide-channel
                     chip-spanning layouts that run in excess of 300MHz
                     while consuming 10--15\% of overall chip resources. We
                     also demonstrate a clustered RISC-V multiprocessor
                     organization that uses Hoplite to help deliver the high
                     processing throughputs of the FPGA architecture to user
                     applications.},
  acknowledgement = ack-nhfb,
  articleno       = {14},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
}

%%% Typed Architectures (ASPLOS '17 paper), SIGARCH Computer Architecture
%%% News printing.  The same paper is also recorded as Kim:2017:TAAb
%%% (Operating Systems Review) and Kim:2017:TAAc (SIGPLAN Notices).
@Article{Kim:2017:TAAa,
  author =       "Channoh Kim and Jaehyeok Kim and Sungmin Kim and
                 Dooyoung Kim and Namho Kim and Gitae Na and Young H. Oh
                 and Hyeon Gyu Cho and Jae W. Lee",
  title =        "Typed Architectures: Architectural Support for
                 Lightweight Scripting",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "77--90",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037726",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/risc-v.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Dynamic scripting languages are becoming more and more
                 widely adopted not only for fast prototyping but also
                 for developing production-grade applications. They
                 provide high-productivity programming environments
                 featuring high levels of abstraction with powerful
                 built-in functions, automatic memory management,
                 object-oriented programming paradigm and dynamic
                 typing. However, their flexible, dynamic type systems
                 easily become the source of inefficiency in terms of
                 instruction count, memory footprint, and energy
                 consumption. This overhead makes it challenging to
                 deploy these high-productivity programming technologies
                 on emerging single-board computers for IoT
                 applications. Addressing this challenge, this paper
                 introduces Typed Architectures, a high-efficiency,
                 low-cost execution substrate for dynamic scripting
                 languages, where each data variable retains high-level
                 type information at an ISA level. Typed Architectures
                 calculate and check the dynamic type of each variable
                 implicitly in hardware, rather than explicitly in
                 software, hence significantly reducing instruction
                 count for dynamic type checking. Besides, Typed
                 Architectures introduce polymorphic instructions (e.g.,
                 xadd), which are bound to the correct native
                 instruction at runtime within the pipeline (e.g., add
                 or fadd) to efficiently implement polymorphic
                 operators. Finally, Typed Architectures provide
                 hardware support for flexible yet efficient type tag
                 extraction and insertion, capturing common data layout
                 patterns of tag-value pairs. Our evaluation using a
                 fully synthesizable RISC-V RTL design on FPGA shows
                 that Typed Architectures achieve geomean speedups of
                 11.2\% and 9.9\% with maximum speedups of 32.6\% and
                 43.5\% for two production-grade scripting engines for
                 JavaScript and Lua, respectively. Moreover, Typed
                 Architectures improve the energy-delay product (EDP) by
                 19.3\% for JavaScript and 16.5\% for Lua with an area
                 overhead of 1.6\% at a 40nm technology node.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  remark =       "ASPLOS '17 conference proceedings.",
}

%%% Typed Architectures (ASPLOS '17 paper), Operating Systems Review
%%% printing.  The same paper is also recorded as Kim:2017:TAAa (SIGARCH
%%% Computer Architecture News) and Kim:2017:TAAc (SIGPLAN Notices).
@Article{Kim:2017:TAAb,
  author =       "Channoh Kim and Jaehyeok Kim and Sungmin Kim and
                 Dooyoung Kim and Namho Kim and Gitae Na and Young H. Oh
                 and Hyeon Gyu Cho and Jae W. Lee",
  title =        "Typed Architectures: Architectural Support for
                 Lightweight Scripting",
  journal =      j-OPER-SYS-REV,
  volume =       "51",
  number =       "2",
  pages =        "77--90",
  month =        jun,
  year =         "2017",
  CODEN =        "OSRED8",
  DOI =          "https://doi.org/10.1145/3093315.3037726",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Mon Jul 24 18:36:23 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/opersysrev.bib;
                 http://www.math.utah.edu/pub/tex/bib/risc-v.bib",
  abstract =     "Dynamic scripting languages are becoming more and more
                 widely adopted not only for fast prototyping but also
                 for developing production-grade applications. They
                 provide high-productivity programming environments
                 featuring high levels of abstraction with powerful
                 built-in functions, automatic memory management,
                 object-oriented programming paradigm and dynamic
                 typing. However, their flexible, dynamic type systems
                 easily become the source of inefficiency in terms of
                 instruction count, memory footprint, and energy
                 consumption. This overhead makes it challenging to
                 deploy these high-productivity programming technologies
                 on emerging single-board computers for IoT
                 applications. Addressing this challenge, this paper
                 introduces Typed Architectures, a high-efficiency,
                 low-cost execution substrate for dynamic scripting
                 languages, where each data variable retains high-level
                 type information at an ISA level. Typed Architectures
                 calculate and check the dynamic type of each variable
                 implicitly in hardware, rather than explicitly in
                 software, hence significantly reducing instruction
                 count for dynamic type checking. Besides, Typed
                 Architectures introduce polymorphic instructions (e.g.,
                 xadd), which are bound to the correct native
                 instruction at runtime within the pipeline (e.g., add
                 or fadd) to efficiently implement polymorphic
                 operators. Finally, Typed Architectures provide
                 hardware support for flexible yet efficient type tag
                 extraction and insertion, capturing common data layout
                 patterns of tag-value pairs. Our evaluation using a
                 fully synthesizable RISC-V RTL design on FPGA shows
                 that Typed Architectures achieve geomean speedups of
                 11.2\% and 9.9\% with maximum speedups of 32.6\% and
                 43.5\% for two production-grade scripting engines for
                 JavaScript and Lua, respectively. Moreover, Typed
                 Architectures improve the energy-delay product (EDP) by
                 19.3\% for JavaScript and 16.5\% for Lua with an area
                 overhead of 1.6\% at a 40nm technology node.",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J597",
  remark =       "ASPLOS '17 conference proceedings.",
}

%%% Typed Architectures (ASPLOS '17 paper), SIGPLAN Notices printing.
%%% The same paper is also recorded as Kim:2017:TAAa (SIGARCH Computer
%%% Architecture News) and Kim:2017:TAAb (Operating Systems Review).
@article{Kim:2017:TAAc,
  author          = {Channoh Kim and Jaehyeok Kim and Sungmin Kim and
                     Dooyoung Kim and Namho Kim and Gitae Na and Young H. Oh
                     and Hyeon Gyu Cho and Jae W. Lee},
  title           = {Typed Architectures: Architectural Support for
                     Lightweight Scripting},
  journal         = j-SIGPLAN,
  volume          = {52},
  number          = {4},
  pages           = {77--90},
  month           = apr,
  year            = {2017},
  coden           = {SINODQ},
  doi             = {https://doi.org/10.1145/3093336.3037726},
  issn            = {0362-1340 (print), 1523-2867 (print), 1558-1160
                     (electronic)},
  issn-l          = {0362-1340},
  bibdate         = {Sat Sep 16 10:18:16 MDT 2017},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/risc-v.bib;
                     http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
  abstract        = {Dynamic scripting languages are becoming more and more
                     widely adopted not only for fast prototyping but also
                     for developing production-grade applications. They
                     provide high-productivity programming environments
                     featuring high levels of abstraction with powerful
                     built-in functions, automatic memory management,
                     object-oriented programming paradigm and dynamic
                     typing. However, their flexible, dynamic type systems
                     easily become the source of inefficiency in terms of
                     instruction count, memory footprint, and energy
                     consumption. This overhead makes it challenging to
                     deploy these high-productivity programming technologies
                     on emerging single-board computers for IoT
                     applications. Addressing this challenge, this paper
                     introduces Typed Architectures, a high-efficiency,
                     low-cost execution substrate for dynamic scripting
                     languages, where each data variable retains high-level
                     type information at an ISA level. Typed Architectures
                     calculate and check the dynamic type of each variable
                     implicitly in hardware, rather than explicitly in
                     software, hence significantly reducing instruction
                     count for dynamic type checking. Besides, Typed
                     Architectures introduce polymorphic instructions (e.g.,
                     xadd), which are bound to the correct native
                     instruction at runtime within the pipeline (e.g., add
                     or fadd) to efficiently implement polymorphic
                     operators. Finally, Typed Architectures provide
                     hardware support for flexible yet efficient type tag
                     extraction and insertion, capturing common data layout
                     patterns of tag-value pairs. Our evaluation using a
                     fully synthesizable RISC-V RTL design on FPGA shows
                     that Typed Architectures achieve geomean speedups of
                     11.2\% and 9.9\% with maximum speedups of 32.6\% and
                     43.5\% for two production-grade scripting engines for
                     JavaScript and Lua, respectively. Moreover, Typed
                     Architectures improve the energy-delay product (EDP) by
                     19.3\% for JavaScript and 16.5\% for Lua with an area
                     overhead of 1.6\% at a 40nm technology node.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM SIGPLAN Notices},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J706},
  remark          = {ASPLOS '17 conference proceedings.},
}

%%% Hardware accelerator for an exact floating-point dot product.
%%% Fields not given here come from the cross-referenced entry
%%% Burgess:2017:ISC (not shown nearby; presumably defined later in
%%% this file, as classic BibTeX requires).
@InProceedings{Koenig:2017:HAC,
  author =       "Jack Koenig and David Biancolin and Jonathan Bachrach
                 and Krste Asanovic",
  title =        "A Hardware Accelerator for Computing an Exact Dot
                 Product",
  crossref =     "Burgess:2017:ISC",
  pages =        "114--121",
  month =        jul,
  year =         "2017",
  DOI =          "https://doi.org/10.1109/ARITH.2017.38",
  ISSN =         "1063-6889",
  bibdate =      "Fri Nov 17 09:10:14 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 http://www.math.utah.edu/pub/tex/bib/risc-v.bib",
  abstract =     "We study the implementation of a hardware accelerator
                 that computes a dot product of IEEE-754 floating-point
                 numbers exactly. The accelerator uses a wide (640 or
                 4288 bits for single or double-precision respectively)
                 fixed-point representation into which intermediate
                 floating-point products are accumulated. We designed
                 the accelerator as a generator in Chisel, which can
                 synthesize various configurations of the accelerator
                 that make different area-performance trade-offs. We
                 integrated eight different configurations into an SoC
                 comprised of RISC-V in-order scalar core, split L1
                 instruction and data caches, and unified L2 cache. In a
                 TSMC 45 nm technology, the accelerator area ranges from
                 0.05 mm$^2$ to 0.32 mm$^2$, and all configurations could be
                 clocked at frequencies in excess of 900MHz. The
                 accelerator successfully saturates the SoC's memory
                 system, achieving the same per-element efficiency (1
                 cycle-per-element) as Intel MKL running on an x86
                 machine with a similar cache configuration.",
  acknowledgement = ack-nhfb,
  keywords =     "accurate floating-point dot product; accurate
                 floating-point summation; area-performance trade-offs;
                 Bandwidth; cache configuration; cache storage; Chisel;
                 Coprocessors; data caches; exact dot product; fixed
                 point arithmetic; fixed-point representation; floating
                 point arithmetic; Generators; Hardware; hardware
                 accelerator; IEEE-754 floating-point numbers; Intel
                 MKL; intermediate floating-point products;
                 Microarchitecture; Registers; RISC-V in-order scalar
                 core; Rockets; size 45 nm; SoC memory system; split L1
                 instruction; system-on-chip; TSMC technology; unified
                 L2 cache",
}

%%% Book entry for Patterson and Waterman's RISC-V Reader (2017).  It
%%% carries a chapter-level listing (shorttableofcontents) and a
%%% section-level listing (tableofcontents); items are separated by
%%% `\\' and each item has the form `title / page'.
@Book{Patterson:2017:RVR,
  author =       "David Patterson and Andrew Waterman",
  title =        "The {RISC-V} Reader: An Open Architecture Atlas",
  publisher =    "Strawberry Canyon",
  address =      "San Francisco, CA, USA",
  pages =        "xiv + 180",
  year =         "2017",
  ISBN =         "0-9992491-1-8",
  ISBN-13 =      "978-0-9992491-1-6",
  LCCN =         "QA76.9.A73 P388 2017",
  bibdate =      "Mon Nov 18 18:47:27 MST 2019",
  bibsource =    "fsz3950.oclc.org:210/WorldCat;
                 http://www.math.utah.edu/pub/tex/bib/master.bib;
                 http://www.math.utah.edu/pub/tex/bib/risc-v.bib",
  acknowledgement = ack-nhfb,
  shorttableofcontents = "RISC-V Reference Card / i \\
                 List of Figures / ix \\
                 Preface / xii \\
                 1 Why RISC-V? / 2 \\
                 2 RV32I: RISC-V Base Integer ISA / 14 \\
                 3 RISC-V Assembly Language / 32 \\
                 4 RV32M: Multiply and Divide / 44 \\
                 5 RV32FD: Single/Double Floating Point / 48 \\
                 6 RV32A: Atomic / 60 \\
                 7 RV32C: Compressed Instructions / 64 \\
                 8 RV32V: Vector / 72 \\
                 9 RV64: 64-bit Address Instructions / 86 \\
                 10 RV32/64 Privileged Architecture / 100 \\
                 11 Future RISC-V Optional Extensions / 118 \\
                 Appendix A: RISC-V Instruction Listings / 120 \\
                 Appendix B: Transliteration from RISC-V / 168 \\
                 Index / 174",
  tableofcontents = "List of Figures / x \\
                 Preface / xii \\
                 1 Why RISC-V? / 2 \\
                 1.1 Introduction / 2 \\
                 1.2 Modular vs. Incremental ISAs / 4 \\
                 1.3 ISA Design 101 / 5 \\
                 1.4 An Overview of this Book / 10 \\
                 1.5 Concluding Remarks / 11 \\
                 1.6 To Learn More / 12 \\
                 2 RV32I: RISC-V Base Integer ISA / 14 \\
                 2.1 Introduction / 14 \\
                 2.2 RV32I Instruction formats / 14 \\
                 2.3 RV32I Registers / 18 \\
                 2.4 RV32I Integer Computation / 18 \\
                 2.5 RV32I Loads and Stores / 20 \\
                 2.6 RV32I Conditional Branch / 21 \\
                 2.7 RV32I Unconditional Jump / 22 \\
                 2.8 RV32I Miscellaneous / 23 \\
                 2.9 Comparing RV32I, ARM-32, MIPS-32, and x86-32 / 23
                 \\
                 2.10 Concluding Remarks / 24 \\
                 2.11 To Learn More / 26 \\
                 3 RISC-V Assembly Language / 32 \\
                 3.1 Introduction / 32 \\
                 3.2 Calling convention / 32 \\
                 3.3 Assembly / 35 \\
                 3.4 Linker / 40 \\
                 3.5 Static vs. Dynamic Linking / 41 \\
                 3.6 Loader / 42 \\
                 3.7 Concluding Remarks / 42 \\
                 3.8 To Learn More / 42 \\
                 4 RV32M: Multiply and Divide / 44 \\
                 4.1 Introduction / 44 \\
                 4.2 Concluding Remarks / 46 \\
                 4.3 To Learn More / 46 \\
                 5 RV32FD: Single/Double Floating Point / 48 \\
                 5.1 Introduction / 48 \\
                 5.2 Floating-Point Registers / 48 \\
                 5.3 Floating-Point Loads, Stores, and Arithmetic / 49
                 \\
                 5.4 Floating-Point Moves and Converts / 53 \\
                 5.5 Miscellaneous Floating-Point Instructions / 53 \\
                 5.6 Comparing RV32FD, ARM-32, MIPS-32, and x86-32 using
                 DAXPY / 55 \\
                 5.7 Concluding Remarks / 55 \\
                 5.8 To Learn More / 56 \\
                 6 RV32A: Atomic / 60 \\
                 6.1 Introduction / 60 \\
                 6.2 Concluding Remarks / 62 \\
                 6.3 To Learn More / 62 \\
                 7 RV32C: Compressed Instructions / 64 \\
                 7.1 Introduction / 64 \\
                 7.2 Comparing RV32GC, Thumb-2, microMIPS, and x86-32 /
                 66 \\
                 7.3 Concluding Remarks / 66 \\
                 7.4 To Learn More / 67 \\
                 8 RV32V: Vector / 72 \\
                 8.1 Introduction / 72 \\
                 8.2 Vector Computation Instructions / 73 \\
                 8.3 Vector Registers and Dynamic Typing / 74 \\
                 8.4 Vector Loads and Stores / 75 \\
                 8.5 Parallelism During Vector Execution / 76 \\
                 8.6 Conditional Execution of Vector Operations / 76 \\
                 8.7 Miscellaneous Vector Instructions / 77 \\
                 8.8 Vector Example: DAXPY in RV32V / 78 \\
                 8.9 Comparing RV32V, MIPS-32 MSA SIMD, and x86-32 AVX
                 SIMD / 79 \\
                 8.10 Concluding Remarks / 81 \\
                 8.11 To Learn More / 82 \\
                 9 RV64: 64-bit Address Instructions / 86 \\
                 9.1 Introduction / 86 \\
                 9.2 Comparison to Other 64-bit ISAs using Insertion
                 Sort / 90 \\
                 9.3 Program size / 92 \\
                 9.4 Concluding Remarks / 93 \\
                 9.5 To Learn More / 93 \\
                 10 RV32/64 Privileged Architecture / 100 \\
                 10.1 Introduction / 100 \\
                 10.2 Machine Mode for Simple Embedded Systems / 101
                 \\
                 10.3 Machine-Mode Exception Handling / 103 \\
                 10.4 User Mode and Process Isolation in Embedded
                 Systems / 106 \\
                 10.5 Supervisor Mode for Modern Operating Systems / 108
                 \\
                 10.6 Page-Based Virtual Memory / 111 \\
                 10.7 Identification and Performance CSRs / 114 \\
                 10.8 Concluding Remarks / 115 \\
                 10.9 To Learn More / 117 \\
                 11 Future RISC-V Optional Extensions / 118 \\
                 11.1 ``B'' Standard Extension for Bit Manipulation /
                 118 \\
                 11.2 ``E'' Standard Extension for Embedded / 118 \\
                 11.3 ``H'' Privileged Architecture Extension for
                 Hypervisor Support / 118 \\
                 11.4 ``J'' Standard Extension for Dynamically
                 Translated Languages / 118 \\
                 11.5 ``L'' Standard Extension for Decimal
                 Floating-Point / 118 \\
                 11.6 ``N'' Standard Extension for User-Level Interrupts
                 / 119 \\
                 11.7 ``P'' Standard Extension for Packed-SIMD
                 Instructions / 119 \\
                 11.8 ``Q'' Standard Extension for Quad-Precision
                 Floating-Point / 119 \\
                 11.9 Concluding Remarks / 119 \\
                 A RISC-V Instruction Listings / 120 \\
                 B Transliteration from RISC-V / 168 \\
                 B.1 Introduction / 168 \\
                 B.2 Comparing RV32I, ARM-32, and x86-32 using Tree Sum
                 / 170 \\
                 B.3 Conclusion / 171 \\
                 Index / 174",
}

%%% TriCheck paper as printed in ACM SIGARCH Computer Architecture
%%% News.  Entries Trippel:2017:TMMb and Trippel:2017:TMMc record the
%%% same title, authors, and page range in two other ACM journals.
@Article{Trippel:2017:TMMa,
  author =       "Caroline Trippel and Yatin A. Manerkar and Daniel
                 Lustig and Michael Pellauer and Margaret Martonosi",
  title =        "{TriCheck}: Memory Model Verification at the
                 Trisection of Software, Hardware, and {ISA}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "45",
  number =       "1",
  pages =        "119--133",
  month =        mar,
  year =         "2017",
  CODEN =        "CANED2",
  DOI =          "https://doi.org/10.1145/3093337.3037719",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Mon Jun 5 18:01:58 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/risc-v.bib;
                 http://www.math.utah.edu/pub/tex/bib/sigarch.bib",
  abstract =     "Memory consistency models (MCMs) which govern
                 inter-module interactions in a shared memory system,
                 are a significant, yet often under-appreciated, aspect
                 of system design. MCMs are defined at the various
                 layers of the hardware-software stack, requiring
                 thoroughly verified specifications, compilers, and
                 implementations at the interfaces between layers.
                 Current verification techniques evaluate segments of
                 the system stack in isolation, such as proving compiler
                 mappings from a high-level language (HLL) to an ISA or
                 proving validity of a microarchitectural implementation
                 of an ISA. This paper makes a case for full-stack MCM
                 verification and provides a toolflow, TriCheck, capable
                 of verifying that the HLL, compiler, ISA, and
                 implementation collectively uphold MCM requirements.
                 The work showcases TriCheck's ability to evaluate a
                 proposed ISA MCM in order to ensure that each layer and
                 each mapping is correct and complete. Specifically, we
                 apply TriCheck to the open source RISC-V ISA [55],
                 seeking to verify accurate, efficient, and legal
                 compilations from C11. We uncover under-specifications
                 and potential inefficiencies in the current RISC-V ISA
                 documentation and identify possible solutions for each.
                 As an example, we find that a RISC-V-compliant
                 microarchitecture allows 144 outcomes forbidden by C11
                 to be observed out of 1,701 litmus tests examined.
                 Overall, this paper demonstrates the necessity of
                 full-stack verification for detecting MCM-related bugs
                 in the hardware-software stack.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J89",
  remark =       "ASPLOS '17 conference proceedings.",
}

%%% TriCheck paper as printed in Operating Systems Review.  Entries
%%% Trippel:2017:TMMa and Trippel:2017:TMMc record the same title,
%%% authors, and page range in two other ACM journals; the remark
%%% field mirrors the one carried by those entries.
@Article{Trippel:2017:TMMb,
  author =       "Caroline Trippel and Yatin A. Manerkar and Daniel
                 Lustig and Michael Pellauer and Margaret Martonosi",
  title =        "{TriCheck}: Memory Model Verification at the
                 Trisection of Software, Hardware, and {ISA}",
  journal =      j-OPER-SYS-REV,
  volume =       "51",
  number =       "2",
  pages =        "119--133",
  month =        jun,
  year =         "2017",
  CODEN =        "OSRED8",
  DOI =          "https://doi.org/10.1145/3093315.3037719",
  ISSN =         "0163-5980 (print), 1943-586X (electronic)",
  ISSN-L =       "0163-5980",
  bibdate =      "Mon Jul 24 18:36:23 MDT 2017",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/opersysrev.bib;
                 http://www.math.utah.edu/pub/tex/bib/risc-v.bib",
  abstract =     "Memory consistency models (MCMs) which govern
                 inter-module interactions in a shared memory system,
                 are a significant, yet often under-appreciated, aspect
                 of system design. MCMs are defined at the various
                 layers of the hardware-software stack, requiring
                 thoroughly verified specifications, compilers, and
                 implementations at the interfaces between layers.
                 Current verification techniques evaluate segments of
                 the system stack in isolation, such as proving compiler
                 mappings from a high-level language (HLL) to an ISA or
                 proving validity of a microarchitectural implementation
                 of an ISA. This paper makes a case for full-stack MCM
                 verification and provides a toolflow, TriCheck, capable
                 of verifying that the HLL, compiler, ISA, and
                 implementation collectively uphold MCM requirements.
                 The work showcases TriCheck's ability to evaluate a
                 proposed ISA MCM in order to ensure that each layer and
                 each mapping is correct and complete. Specifically, we
                 apply TriCheck to the open source RISC-V ISA [55],
                 seeking to verify accurate, efficient, and legal
                 compilations from C11. We uncover under-specifications
                 and potential inefficiencies in the current RISC-V ISA
                 documentation and identify possible solutions for each.
                 As an example, we find that a RISC-V-compliant
                 microarchitecture allows 144 outcomes forbidden by C11
                 to be observed out of 1,701 litmus tests examined.
                 Overall, this paper demonstrates the necessity of
                 full-stack verification for detecting MCM-related bugs
                 in the hardware-software stack.",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J597",
  remark =       "ASPLOS '17 conference proceedings.",
}

%%% TriCheck paper as printed in ACM SIGPLAN Notices.  Entries
%%% Trippel:2017:TMMa and Trippel:2017:TMMb record the same title,
%%% authors, and page range in two other ACM journals.
@article{Trippel:2017:TMMc,
  author          = {Caroline Trippel and Yatin A. Manerkar and Daniel
                     Lustig and Michael Pellauer and Margaret Martonosi},
  title           = {{TriCheck}: Memory Model Verification at the
                     Trisection of Software, Hardware, and {ISA}},
  journal         = j-SIGPLAN,
  fjournal        = {ACM SIGPLAN Notices},
  year            = {2017},
  month           = apr,
  volume          = {52},
  number          = {4},
  pages           = {119--133},
  DOI             = {https://doi.org/10.1145/3093336.3037719},
  ISSN            = {0362-1340 (print), 1523-2867 (print), 1558-1160
                     (electronic)},
  ISSN-L          = {0362-1340},
  CODEN           = {SINODQ},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J706},
  remark          = {ASPLOS '17 conference proceedings.},
  abstract        = {Memory consistency models (MCMs) which govern
                     inter-module interactions in a shared memory system,
                     are a significant, yet often under-appreciated, aspect
                     of system design. MCMs are defined at the various
                     layers of the hardware-software stack, requiring
                     thoroughly verified specifications, compilers, and
                     implementations at the interfaces between layers.
                     Current verification techniques evaluate segments of
                     the system stack in isolation, such as proving compiler
                     mappings from a high-level language (HLL) to an ISA or
                     proving validity of a microarchitectural implementation
                     of an ISA. This paper makes a case for full-stack MCM
                     verification and provides a toolflow, TriCheck, capable
                     of verifying that the HLL, compiler, ISA, and
                     implementation collectively uphold MCM requirements.
                     The work showcases TriCheck's ability to evaluate a
                     proposed ISA MCM in order to ensure that each layer and
                     each mapping is correct and complete. Specifically, we
                     apply TriCheck to the open source RISC-V ISA [55],
                     seeking to verify accurate, efficient, and legal
                     compilations from C11. We uncover under-specifications
                     and potential inefficiencies in the current RISC-V ISA
                     documentation and identify possible solutions for each.
                     As an example, we find that a RISC-V-compliant
                     microarchitecture allows 144 outcomes forbidden by C11
                     to be observed out of 1,701 litmus tests examined.
                     Overall, this paper demonstrates the necessity of
                     full-stack verification for detecting MCM-related bugs
                     in the hardware-software stack.},
  bibdate         = {Sat Sep 16 10:18:16 MDT 2017},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/risc-v.bib;
                     http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib},
  acknowledgement = ack-nhfb,
}

%%% IEEE Micro article on the Celerity 511-core RISC-V accelerator
%%% fabric (March/April 2018 issue).
@article{Davidson:2018:COS,
  author          = {Scott Davidson and Shaolin Xie and Christopher Torng
                     and Khalid Al-Hawai and Austin Rovinski and Tutu Ajayi
                     and Luis Vega and Chun Zhao and Ritchie Zhao and Steve
                     Dai and Aporva Amarnath and Bandhav Veluri and Paul Gao
                     and Anuj Rao and Gai Liu and Rajesh K. Gupta and Zhiru
                     Zhang and Ronald Dreslinski and Christopher Batten and
                     Michael Bedford Taylor},
  title           = {The {Celerity} Open-Source 511-Core {RISC-V} Tiered
                     Accelerator Fabric: Fast Architectures and Design
                     Methodologies for Fast Chips},
  journal         = j-IEEE-MICRO,
  year            = {2018},
  month           = mar # "\slash " # apr,
  volume          = {38},
  number          = {2},
  pages           = {30--41},
  DOI             = {https://doi.org/10.1109/MM.2018.022071133},
  URL             = {https://www.computer.org/csdl/mags/mi/2018/02/mmi2018020030-abs.html},
  ISSN            = {0272-1732 (print), 1937-4143 (electronic)},
  ISSN-L          = {0272-1732},
  CODEN           = {IEMIDZ},
  journal-URL     = {http://www.computer.org/csdl/mags/mi/index.html},
  bibdate         = {Sat Apr 28 13:18:45 MDT 2018},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/gnu.bib;
                     http://www.math.utah.edu/pub/tex/bib/hot-chips.bib;
                     http://www.math.utah.edu/pub/tex/bib/ieeemicro.bib;
                     http://www.math.utah.edu/pub/tex/bib/risc-v.bib},
  acknowledgement = ack-nhfb,
}

%%% IEEE Computer Architecture Letters article on Nile, a programmable
%%% hardware monitor attached to a RISC-V processor.  The lower-case
%%% fields after the abstract (affiliation, da, doc-delivery-number,
%%% times-cited, ...) look like citation-database metadata; standard
%%% styles are expected to ignore them -- TODO confirm.
@Article{Delshadtehrani:2018:NPM,
  author =       "Leila Delshadtehrani and Schuyler Eldridge and
                 Sadullah Canakci and Manuel Egele and Ajay Joshi",
  title =        "{Nile}: a Programmable Monitoring Coprocessor",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "1",
  pages =        "92--95",
  month =        jan # "\slash " # jun,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2784416",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/risc-v.bib",
  abstract =     "Researchers widely employ hardware performance
                 counters (HPCs) as well as debugging and profiling
                 tools in processors for monitoring different events
                 such as cache hits, cache misses, and branch prediction
                 statistics during the execution of programs. The
                 collected information can be used for power,
                 performance, and thermal management of the system as
                 well as detecting anomalies or malicious behavior in
                 the software. However, monitoring new or complex events
                 using HPCs and existing tools is a challenging task
                 because HPCs only provide a fixed pool of raw events to
                 monitor. To address this challenge, we propose the
                 implementation of a programmable hardware monitor in a
                 complete system framework including the hardware
                 monitor architecture and its interface with an in-order
                 single-issue RISC-V processor as well as an operating
                 system. As a proof of concept, we demonstrate how to
                 programmatically implement a shadow stack using our
                 hardware monitor and how the programmed shadow stack
                 detects stack buffer overflow attacks. Our hardware
                 monitor design incurs a 26 percent power overhead and a
                 15 percent area overhead over an unmodified RISC-V
                 processor. Our programmed shadow stack has less than 3
                 percent performance overhead in the worst case.",
  acknowledgement = ack-nhfb,
  affiliation =  "Delshadtehrani, L (Reprint Author), Boston Univ, Dept
                 Elect \& Comp Engn, Boston, MA 02215 USA.
                 Delshadtehrani, Leila; Eldridge, Schuyler; Canakci,
                 Sadullah; Egele, Manuel; Joshi, Ajay, Boston Univ, Dept
                 Elect \& Comp Engn, Boston, MA 02215 USA.",
  author-email = "delshad@bu.edu schuye@bu.edu scanakci@bu.edu
                 megele@bu.edu joshi@bu.edu",
  da =           "2019-06-20",
  doc-delivery-number = "FZ6EO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF [CCF-1533663]",
  funding-text = "We thank Prof. Jonathan Appavoo for providing
                 invaluable help in designing the OS support and the
                 software interface for Nile. This work was supported in
                 part by NSF grant CCF-1533663.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "branch prediction statistics; cache hits; cache
                 misses; cache storage; complete system framework;
                 complex events; coprocessors; Coprocessors; debugging;
                 fixed pool; Hardware; Hardware coprocessor; hardware
                 monitor architecture; hardware monitor design; hardware
                 performance counters; HPCs; Linux; malicious behavior;
                 Monitoring; Nile; operating system; operating systems
                 (computers); Pattern matching; performance evaluation;
                 performance overhead; power overhead; profiling tools;
                 Program processors; programmable hardware; programmable
                 hardware monitor; programmable monitoring coprocessor;
                 programmed shadow stack; raw events; reduced
                 instruction set computing; Rockets; security; shadow
                 stack; single-issue RISC-V processor; stack buffer
                 overflow attack; stack buffer overflow attacks; thermal
                 management; unmodified RISC-V processor",
  number-of-cited-references = "17",
  ORCID-numbers = "Joshi, Ajay/0000-0002-3256-9942",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Delshadtehrani:2018:NPM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Conference paper on a variable-precision UNUM type I coprocessor
%%% for RISC-V; proceedings data are inherited through the crossref
%%% to Takagi:2019:ISC.  Keywords are semicolon-separated, one term
%%% per item.
@InProceedings{Bocco:2019:DPN,
  author =       "Andrea Bocco and Yves Durand and Florent de Dinechin",
  title =        "Dynamic Precision Numerics Using a Variable-Precision
                 {UNUM Type I HW} Coprocessor",
  crossref =     "Takagi:2019:ISC",
  pages =        "104--107",
  month =        jun,
  year =         "2019",
  DOI =          "https://doi.org/10.1109/ARITH.2019.00028",
  bibdate =      "Fri Jan 31 08:18:07 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 http://www.math.utah.edu/pub/tex/bib/risc-v.bib",
  abstract =     "A very large internal accumulation register has been
                 proposed to increase the accuracy of scientific code.
                 However, there is a general class of iterative kernels
                 where a vector of high-precision data must be saved
                 from one iteration to the next. Saving the large
                 internal accumulator to memory is impractical in such
                 cases. This work proposes a Variable Precision (VP)
                 Floating Point (FP) arithmetic co-processor
                 architecture based on RISC-V, which 1/ supports legacy
                 IEEE formats for input and output variables, 2/ uses
                 variable length internal registers (up to 512 bits of
                 mantissa) for inner loop multiply-add and 3/ supports
                 loads and stores of intermediate results to cache
                 memory with a dynamically adjustable precision (up to
                 256 bits of mantissa). It exploits the UNUM type I
                 floating point format, proposing solutions to address
                 some of its pitfalls such as the variable latency of
                 the internal operation, and the variable memory
                 footprint of the intermediate variables. This work is
                 integrated on FPGA and demonstrated on a representative
                 example.",
  acknowledgement = ack-nhfb,
  keywords =     "ARITH-26; Arrays; ASIC; cache storage; Computational
                 modeling; Coprocessor; coprocessors; Coprocessors;
                 dynamic Precision numerics; field programmable gate
                 arrays; floating point arithmetic; floating point
                 arithmetic co-processor architecture; Floating-point;
                 FPGA; Hardware architecture; Instruction set design;
                 internal accumulation register; iterative kernels;
                 iterative methods; Kernel; Multiple precision;
                 Programming; reduced instruction set computing;
                 Registers; RISC-V; Scientific computing; UNUM; Variable
                 Precision; Variable precision; variable-precision UNUM
                 Type I HW coprocessor",
}

%%% Conference paper on the SMURF variable-precision UNUM accelerator
%%% for RISC-V; proceedings data are inherited through the crossref
%%% to Gustafson:2019:PCN.  Keywords are semicolon-separated, one
%%% term per item.
@InProceedings{Bocco:2019:SSM,
  author =       "Andrea Bocco and Yves Durand and Florent de Dinechin",
  title =        "{SMURF}: {Scalar Multiple-precision Unum Risc-V
                 Floating-point} Accelerator for Scientific Computing",
  crossref =     "Gustafson:2019:PCN",
  pages =        "1:1--1:8",
  year =         "2019",
  DOI =          "https://doi.org/10.1145/3316279.3316280",
  bibdate =      "Mon Feb 10 09:31:49 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 http://www.math.utah.edu/pub/tex/bib/risc-v.bib",
  URL =          "https://hal.inria.fr/hal-02087098",
  abstract =     "This paper proposes an innovative Floating Point (FP)
                 architecture for Variable Precision (VP) computation
                 suitable for high precision FP computing, based on a
                 refined version of the UNUM type I format. This
                 architecture supports VP FP intervals where each
                 interval endpoint can have up to 512 bits of mantissa.
                 The proposed hardware architecture is pipelined and has
                 an internal word-size of 64 bits. Computations on
                 longer mantissas are performed iteratively on the
                 existing hardware. The prototype is integrated in a
                 RISC-V environment, it is exposed to the user through
                 an instruction set extension. The paper we provide an
                 example of software usage. The system has been
                 prototyped on a FPGA (Field-Programmable Gate Array)
                 platform and also synthesized for a 28nm FDSOI process
                 technology. The respective working frequency of FPGA
                 and ASIC implementations are 50MHz and 600MHz. The
                 estimated chip area is 1.5mm$^2$ and the estimated power
                 consumption is 95mW. The flops performance of this
                 architecture remains within the range of a regular
                 fixed-precision IEEE FPU while enabling arbitrary
                 precision computation at reasonable cost.",
  acknowledgement = ack-nhfb,
  articleno =    "Article 1",
  keywords =     "ASIC; Coprocessor; Floating-point; FPGA; Hardware
                 architecture; Instruction set design; Multiple
                 precision; RISC-V; Scientific computing; UNUM; Variable
                 precision",
}

%%% ACM TACO article on a moving-compute-to-data synchronization
%%% model, evaluated in part on a simulated RISC-V multicore.  The
%%% final page is recorded as unknown (`4:??').
@article{Dogan:2019:ASU,
  author          = {Halit Dogan and Masab Ahmad and Brian Kahne and Omer
                     Khan},
  title           = {Accelerating Synchronization Using Moving Compute to
                     Data Model at 1,000-core Multicore Scale},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  year            = {2019},
  month           = mar,
  volume          = {16},
  number          = {1},
  pages           = {4:1--4:??},
  articleno       = {4},
  DOI             = {https://doi.org/10.1145/3300208},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  abstract        = {Thread synchronization using shared memory hardware
                     cache coherence paradigm is prevalent in multicore
                     processors. However, as the number of cores increase on
                     a chip, cache line ping-pong prevents performance
                     scaling for algorithms that deploy fine-grain
                     synchronization. This article proposes an in-hardware
                     moving computation to data model (MC) that pins shared
                     data at dedicated cores. The critical code sections are
                     serialized and executed at these cores in a spatial
                     setting to enable data locality optimizations.
                     In-hardware messages enable non-blocking and blocking
                     communication between cores, without involving the
                     cache coherence protocol. The in-hardware MC model is
                     implemented on Tilera Tile-Gx72 multicore platform to
                     evaluate 8- to 64-core count scale. A simulated RISC-V
                     multicore environment is built to further evaluate the
                     performance scaling advantages of the MC model at
                     1,024-cores scale. The evaluation using graph and
                     machine-learning benchmarks illustrates that atomic
                     instructions based synchronization scales up to 512
                     cores, and the MC model at the same core count
                     outperforms by 27\% in completion time and 39\% in
                     dynamic energy consumption.},
  bibdate         = {Mon Mar 11 19:00:20 MDT 2019},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/risc-v.bib;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

%%% ACM TODAES 24(1), article 10 (January 2019): formalizes the
%%% Instruction-Level Abstraction (ILA) for SoC verification; RISC-V is
%%% one of the case studies.  The final page number is recorded as
%%% unknown (10:??).
@article{Huang:2019:ILA,
  author          = {Bo-Yuan Huang and Hongce Zhang and Pramod Subramanyan
                     and Yakir Vizel and Aarti Gupta and Sharad Malik},
  title           = {Instruction-Level Abstraction {(ILA)}: a Uniform
                     Specification for System-on-Chip {(SoC)} Verification},
  journal         = j-TODAES,
  volume          = {24},
  number          = {1},
  pages           = {10:1--10:??},
  month           = jan,
  year            = {2019},
  coden           = {ATASFO},
  doi             = {https://doi.org/10.1145/3282444},
  issn            = {1084-4309 (print), 1557-7309 (electronic)},
  issn-l          = {1084-4309},
  bibdate         = {Fri Mar 22 16:58:40 MDT 2019},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                     http://www.math.utah.edu/pub/tex/bib/risc-v.bib;
                     http://www.math.utah.edu/pub/tex/bib/todaes.bib},
  abstract        = {Modern Systems-on-Chip (SoC) designs are increasingly
                     heterogeneous and contain specialized semi-programmable
                     accelerators in addition to programmable processors. In
                     contrast to the pre-accelerator era, when the ISA
                     played an important role in verification by enabling a
                     clean separation of concerns between software and
                     hardware, verification of these ``accelerator-rich''
                     SoCs presents new challenges. From the perspective of
                     hardware designers, there is a lack of a common
                     framework for formal functional specification of
                     accelerator behavior. From the perspective of software
                     developers, there exists no unified framework for
                     reasoning about software/hardware interactions of
                     programs that interact with accelerators. This article
                     addresses these challenges by providing a formal
                     specification and high-level abstraction for
                     accelerator functional behavior. It formalizes the
                     concept of an Instruction Level Abstraction (ILA),
                     developed informally in our previous work, and shows
                     its application in modeling and verification of
                     accelerators. This formal ILA extends the familiar
                     notion of instructions to accelerators and provides a
                     uniform, modular, and hierarchical abstraction for
                     modeling software-visible behavior of both accelerators
                     and programmable processors. We demonstrate the
                     applicability of the ILA through several case studies
                     of accelerators (for image processing, machine
                     learning, and cryptography), and a general-purpose
                     processor (RISC-V). We show how the ILA model
                     facilitates equivalence checking between two ILAs, and
                     between an ILA and its hardware finite-state machine
                     (FSM) implementation. Further, this equivalence
                     checking supports accelerator upgrades using the notion
                     of ILA compatibility, similar to processor upgrades
                     using ISA compatibility.},
  acknowledgement = ack-nhfb,
  articleno       = {10},
  fjournal        = {ACM Transactions on Design Automation of Electronic
                     Systems},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J776},
}

%%% IEEE Transactions on Computers 68(9), September 2019: ALU protection
%%% for soft processors on SRAM-based FPGAs (RISC-V appears among the
%%% keywords).  No abstract is recorded for this entry.
@article{Ramos:2019:APM,
  author          = {A. Ramos and R. G. Toral and P. Reviriego and J. A.
                     Maestro},
  title           = {An {ALU} Protection Methodology for Soft Processors on
                     {SRAM}-Based {FPGAs}},
  journal         = j-IEEE-TRANS-COMPUT,
  volume          = {68},
  number          = {9},
  pages           = {1404--1410},
  month           = sep,
  year            = {2019},
  coden           = {ITCOB4},
  doi             = {https://doi.org/10.1109/TC.2019.2907238},
  issn            = {0018-9340 (print), 1557-9956 (electronic)},
  issn-l          = {0018-9340},
  bibdate         = {Fri Aug 30 05:58:40 2019},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib;
                     http://www.math.utah.edu/pub/tex/bib/risc-v.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {IEEE Transactions on Computers},
  journal-url     = {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12},
  keywords        = {adaptive protection; ALU; ALU protection methodology;
                     application-based methodology; arithmetic logic unit;
                     cosmic background radiation; cosmic radiation; digital
                     arithmetic; fault tolerance; Fault tolerant systems;
                     Field programmable gate arrays; field programmable gate
                     arrays; FPGA; hardware configuration library;
                     integrated circuit reliability; logic testing;
                     Microprocessors; modular redundancy techniques; Program
                     processors; radiation hardening (electronics);
                     redundancy; Redundancy; RISC-V; soft core; soft error;
                     soft errors; soft processor; space missions; SRAM
                     chips; SRAM-based FPGA; TMR configurations},
}

%%% IEEE Computer Architecture Letters 18(1), January/June 2019:
%%% LLVM-based accelerator modeling inside gem5; the abstract lists
%%% RISC-V support as a future plan.  The affiliation field uses the
%%% abbreviated "Univ North Carolina" form (spelling of "North"
%%% corrected from "Noth").
@Article{Rogers:2019:SLB,
  author =       "Samuel Rogers and Joshua Slycord and Ronak Raheja and
                 Hamed Tabkhi",
  title =        "Scalable {LLVM}-Based Accelerator Modeling in gem5",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "1",
  pages =        "18--21",
  month =        jan # "\slash " # jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2019.2893932",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/python.bib;
                 http://www.math.utah.edu/pub/tex/bib/risc-v.bib",
  abstract =     "This article proposes a scalable integrated system
                 architecture modeling for hardware accelerator based in
                 gem5 simulation framework. The core of proposed
                 modeling is a LLVM-based simulation engine for modeling
                 any customized data-path with respect to inherent
                 data/instruction-level parallelism (derived by
                 algorithms) and available compute units (defined by the
                 user). The simulation framework also offers a
                 general-purpose communication interface that allows a
                 scalable and flexible connection into the gem5
                 ecosystem. Python API of gem5, enabling modifications
                 to the system hierarchy without the need to rebuild the
                 underlying simulator. Our simulation framework
                 currently supports full-system simulation (both
                 bare-metal and a full Linux kernel) for ARM-based
                 systems, with future plans to add support for RISC-V.
                 The LLVM-based modeling and modular integration to gem5
                 allow long-term simulation expansion and sustainable
                 design modeling for emerging applications with demands
                 for acceleration.",
  acknowledgement = ack-nhfb,
  affiliation =  "Rogers, S (Reprint Author), Univ North Carolina, Dept
                 Elect \& Comp Engn, Charlotte, NC 28223 USA. Rogers,
                 Samuel; Slycord, Joshua; Raheja, Ronak; Tabkhi, Hamed,
                 Univ North Carolina, Dept Elect \& Comp Engn, Charlotte,
                 NC 28223 USA.",
  author-email = "sroger48@uncc.edu jslycord@uncc.edu rraheja@uncc.edu
                 htabkhiv@uncc.edu",
  da =           "2019-06-20",
  doc-delivery-number = "HL5MF",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "application program interfaces; ARM-based systems;
                 Computational modeling; Computer architecture
                 simulation; customized data-path; Engines; field
                 programmable gate arrays; flexible connection;
                 full-system simulation; gem5 ecosystem; gem5 simulation
                 framework; general-purpose communication interface;
                 Hardware; hardware accelerator; hardware accelerators;
                 heterogeneous systems; inherent data; instruction-level
                 parallelism; Linux; LLVM-based modeling; LLVM-based
                 simulation engine; logic design; long-term simulation
                 expansion; microprocessor chips; multiprocessing
                 systems; parallel architectures; parallel programming;
                 program compilers; reduced instruction set computing;
                 Registers; RISC-V; Runtime; scalable connection;
                 scalable integrated system architecture modeling;
                 scalable LLVM-based accelerator modeling; Space
                 exploration; sustainable design modeling;
                 Synchronization; system hierarchy",
  number-of-cited-references = "11",
  ORCID-numbers = "Slycord, Joshua/0000-0002-0569-4094 Rogers,
                 Samuel/0000-0002-9697-2933",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Rogers:2019:SLB",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% arXiv preprint (volume and number recorded as unknown): PERI, a
%%% Posit-enabled RISC-V core built on the SHAKTI C-class core.
%%% In the abstract, "IEEE 754-2008" is written without an embedded
%%% space, and the extension name is quoted as `F' so that LaTeX
%%% typesets matching opening and closing quotation marks.
@Article{Tiwari:2019:PPE,
  author =       "Sugandha Tiwari and Neel Gala and Chester Rebeiro and
                 V. Kamakoti",
  title =        "{PERI}: A Posit Enabled {RISC-V} Core",
  journal =      "arXiv.org",
  volume =       "??",
  number =       "??",
  pages =        "1--14",
  month =        nov,
  year =         "2019",
  bibdate =      "Thu Apr 09 15:06:39 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 http://www.math.utah.edu/pub/tex/bib/risc-v.bib",
  URL =          "https://arxiv.org/pdf/1908.01466.pdf",
  abstract =     "Owing to the failure of Dennard's scaling the last
                 decade has seen a steep growth of prominent new
                 paradigms leveraging opportunities in computer
                 architecture. Two technologies of interest are Posit
                 and RISC-V. Posit was introduced in mid-2017 as a
                 viable alternative to IEEE 754-2008. Posit promises
                 more accuracy, higher dynamic range and fewer unused
                 states along with simpler hardware designs as compared
                 to IEEE 754-2008. RISC-V, on the other hand, provides
                 a commercial-grade open-source ISA. It is not only
                 elegant and simple but also highly extensible and
                 customizable, thereby facilitating novel
                 micro-architectural research and exploration. In this
                 paper, we bring these two technologies together and
                 propose the first Posit Enabled RISC-V core. The paper
                 provides insights on how the current `F' extension and
                 the custom op-code space of RISCV can be
                 leveraged/modified to support Posit arithmetic. We also
                 present implementation details of a parameterized and
                 feature-complete Posit FPU which is integrated with the
                 RISC-V compliant SHAKTI C-class core either as an
                 execution unit or as an accelerator. To fully leverage
                 the potential of Posit, we further enhance our Posit
                 FPU, with minimal overheads, to support two different
                 exponent sizes (with posit-size being 32-bits). This
                 allows applications to switch from high-accuracy
                 computation mode to a mode with higher dynamic-range at
                 run-time. In the absence of viable software tool-chain
                 to enable porting of applications in the Posit domain,
                 we present a workaround on how certain applications can
                 be modified minimally to exploit the existing RISC-V
                 tool-chain. We also provide examples of applications
                 which can perform better with Posit as compared to IEEE
                 754-2008. The proposed Posit FPU consumes 3507 slice
                 LUTs and 1294 slice registers on an Artix-7-100T Xilinx
                 FPGA while capable of operating at 100 MHz.",
  acknowledgement = ack-nhfb,
  keywords =     "floating-point; IEEE-754; Posit; processor; RISC-V",
}

%%% IEEE Micro 39(3), May/June 2019: composable modular design (CMD)
%%% building blocks for processors; the keywords mention an
%%% out-of-order RISC-V processor.  No abstract is recorded.
@article{Zhang:2019:CBB,
  author          = {S. Zhang and A. Wright and T. Bourgeat},
  title           = {Composable Building Blocks to Open Up Processor
                     Design},
  journal         = j-IEEE-MICRO,
  volume          = {39},
  number          = {3},
  pages           = {47--55},
  month           = may # {\slash } # jun,
  year            = {2019},
  coden           = {IEMIDZ},
  doi             = {https://doi.org/10.1109/MM.2019.2910012},
  issn            = {0272-1732 (print), 1937-4143 (electronic)},
  issn-l          = {0272-1732},
  bibdate         = {Thu Jul 25 15:33:44 2019},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/ieeemicro.bib;
                     http://www.math.utah.edu/pub/tex/bib/risc-v.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {IEEE Micro},
  journal-url     = {http://www.computer.org/csdl/mags/mi/index.html},
  keywords        = {atomic rules; atomic updates; CMD ensure
                     composability; composable building blocks; composable
                     modular design; Concurrent computing; instantaneous
                     access; interface method; Linux; Microarchitecture;
                     microprocessor chips; Out of order; out-of-order
                     processors; out-of-order RISC-V processor; processor
                     design; reduced instruction set computing; Registers;
                     software architecture; state elements; System recovery;
                     Timing; Wires},
}

%%% News item (Web site, 21 September 2020) announcing RV32E support in
%%% the SEGGER floating-point library.
@misc{Anonymous:2020:RVE,
  author          = {Anonymous},
  title           = {{RISC-V} embedded variant {RV32E} now fully supported
                     by {SEGGER}'s Floating-Point library},
  howpublished    = {Web site},
  day             = {21},
  month           = sep,
  year            = {2020},
  bibdate         = {Thu Jan 28 18:02:53 2021},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/fparith.bib;
                     http://www.math.utah.edu/pub/tex/bib/risc-v.bib},
  url             = {https://www.design-reuse.com/news/48672/segger-s-floating-point-library-risc-v-rv32e.html},
  acknowledgement = ack-nhfb,
  remark          = {The story reports a significant code size reduction,
                     and speedup, over the GNU floating-point library.},
}

%%% Communications of the ACM 63(5), May 2020: news article on the
%%% prospects of the open RISC-V instruction set.
@article{Greengard:2020:NWR,
  author          = {Samuel Greengard},
  title           = {News: Will {RISC-V} revolutionize computing?},
  journal         = j-CACM,
  volume          = {63},
  number          = {5},
  pages           = {30--32},
  month           = may,
  year            = {2020},
  coden           = {CACMA2},
  doi             = {https://doi.org/10.1145/3386377},
  issn            = {0001-0782 (print), 1557-7317 (electronic)},
  issn-l          = {0001-0782},
  bibdate         = {Tue Apr 21 15:30:10 MDT 2020},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/cacm2020.bib;
                     http://www.math.utah.edu/pub/tex/bib/risc-v.bib},
  url             = {https://dl.acm.org/doi/abs/10.1145/3386377},
  abstract        = {The open instruction set for microprocessors promises
                     to reshape computing and introduce new, more powerful
                     capabilities.},
  acknowledgement = ack-nhfb,
  fjournal        = {Communications of the ACM},
  journal-url     = {https://dl.acm.org/loi/cacm},
}

%%% FreeBSD Journal, January/February 2020 (volume and number recorded
%%% as unknown): tutorial on cross-building and booting FreeBSD for
%%% RISC-V under QEMU.  Carries a bibsource self-reference to
%%% risc-v.bib, like the other entries in this file.
@Article{Horne:2020:GSF,
  author =       "Mitchell Horne",
  title =        "Getting Started with {FreeBSD\slash RISC-V}",
  journal =      "FreeBSD Journal",
  volume =       "??",
  number =       "??",
  pages =        "12--17",
  month =        jan # "\slash " # feb,
  year =         "2020",
  bibdate =      "Fri Dec 23 11:24:52 2022",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/risc-v.bib",
  URL =          "https://freebsdfoundation.org/wp-content/uploads/2020/03/Getting-Started-With-FreeBSD-RISC-V.pdf",
  acknowledgement = ack-nhfb,
  journal-URL =  "https://freebsdfoundation.org/our-work/journal/",
  remark =       "This article contains a clear description of
                 cross-compiling a kernel for FreeBSD on RISC-V on
                 another (non-RISC-V) FreeBSD system, creating a virtual
                 image, booting the new image with qemu-system-riscv64,
                 enabling networking, and later doing cross-compiled
                 kernel updates.",
}

%%% IEEE Micro 40(4), July/August 2020: BlackParrot, an open-source,
%%% Linux-capable, cache-coherent 64-bit RISC-V multicore.
@article{Petrisko:2020:BAO,
  author          = {D. Petrisko and F. Gilani and M. Wyse and D. C. Jung
                     and S. Davidson and P. Gao and C. Zhao and Z. Azad and
                     S. Canakci and B. Veluri and T. Guarino and A. Joshi
                     and M. Oskin and M. B. Taylor},
  title           = {{BlackParrot}: An Agile Open-Source {RISC-V} Multicore
                     for Accelerator {SoCs}},
  journal         = j-IEEE-MICRO,
  volume          = {40},
  number          = {4},
  pages           = {93--102},
  month           = jul # {\slash } # aug,
  year            = {2020},
  coden           = {IEMIDZ},
  doi             = {https://doi.org/10.1109/MM.2020.2996145},
  issn            = {0272-1732 (print), 1937-4143 (electronic)},
  issn-l          = {0272-1732},
  bibdate         = {Wed Jul 29 07:59:51 2020},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/gnu.bib;
                     http://www.math.utah.edu/pub/tex/bib/ieeemicro.bib;
                     http://www.math.utah.edu/pub/tex/bib/linux.bib;
                     http://www.math.utah.edu/pub/tex/bib/risc-v.bib;
                     http://www.math.utah.edu/pub/tex/bib/unix.bib},
  abstract        = {This article introduces BlackParrot, which aims to be
                     the default open-source, Linux-capable, cache-coherent,
                     64-bit RISC-V multicore used by the world. In executing
                     this goal, our research aims to advance the world's
                     knowledge about the software engineering of hardware.
                     Although originally bootstrapped by the University of
                     Washington and Boston University via DARPA funding,
                     BlackParrot strives to be community driven and
                     infrastructure agnostic; a multicore which is Pareto
                     optimal in terms of power, performance, area, and
                     complexity. In order to ensure BlackParrot is easy to
                     use, extend, and, most importantly, trust, development
                     is guided by three core principles: Be Tiny, Be
                     Modular, and Be Friendly. Development efforts have
                     prioritized the use of intentional interfaces and
                     modularity and silicon validation as first-order design
                     metrics, so that users can quickly get started and
                     trust that their design will perform as expected when
                     deployed. BlackParrot has been validated in a
                     GlobalFoundries 12-nm FinFET tapeout. BlackParrot is
                     ideal as a standalone Linux processor or as a malleable
                     fabric for an agile accelerator SoC design flow.},
  acknowledgement = ack-nhfb,
  fjournal        = {IEEE Micro},
  journal-url     = {http://www.computer.org/csdl/mags/mi/index.html},
}

%%% Vendor Web page (January 2020) describing the SEGGER floating-point
%%% library, whose Arm and RISC-V emulator variants are written in
%%% assembly language.  The corporate author is wrapped in an extra
%%% brace pair so that BibTeX treats it as a single name.
@Misc{SEGGER:2020:SFP,
  author =       "{SEGGER Microcontroller}",
  title =        "{SEGGER} Floating-Point Library",
  howpublished = "Web site",
  month =        jan,
  year =         "2020",
  bibdate =      "Fri Feb 07 06:02:26 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 http://www.math.utah.edu/pub/tex/bib/risc-v.bib",
  URL =          "https://www.segger.com/products/development-tools/runtime-library/technology/floating-point-library/",
  abstract =     "The floating-point library contains complete, fully
                 optimized and verified floating point functionality,
                 which is required for devices without an FPU. The
                 floating-point emulator, a crucial part of the
                 floating-point library, of the Arm and RISC-V variants
                 are written in assembly language, optimized either for
                 small code size or increased execution speed. For other
                 processor architectures the library has a portable C
                 implementation. \ldots{} The SEGGER Floating-Point
                 Library is delivered in source code, with optional
                 rights for redistribution in object code form. All
                 source files, a mix of C code and assembly, are fully
                 commented. The floating-point emulator, providing the
                 low-level functions, is entirely written in assembly.
                 Higher level functions are implemented as a mix of
                 primarily C code with some assembly routines. The code
                 can be compiled with any ANSI-compliant C compiler,
                 such as GCC, Clang, or IAR.",
  acknowledgement = ack-nhfb,
}

%%% ACM TRETS 13(4), article 19 (October 2020): MEG, a RISCV-based
%%% system emulation infrastructure for near-data processing.  The
%%% recorded abstract is truncated (it ends in an ellipsis).
@article{Zhang:2020:MRB,
  author          = {Jialiang Zhang and Yue Zha and Nicholas Beckwith and
                     Bangya Liu and Jing Li},
  title           = {{MEG}: a {RISCV}-based System Emulation Infrastructure
                     for Near-data Processing Using {FPGAs} and
                     High-bandwidth Memory},
  journal         = j-TRETS,
  volume          = {13},
  number          = {4},
  pages           = {19:1--19:24},
  month           = oct,
  year            = {2020},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3409114},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Fri Oct 2 07:58:13 MDT 2020},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/risc-v.bib;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  url             = {https://dl.acm.org/doi/10.1145/3409114},
  abstract        = {Emerging three-dimensional (3D) memory technologies,
                     such as the Hybrid Memory Cube (HMC) and High Bandwidth
                     Memory (HBM), provide high-bandwidth and massive
                     memory-level parallelism. With the growing
                     heterogeneity and complexity of computer systems
                     \ldots{}},
  acknowledgement = ack-nhfb,
  articleno       = {19},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {https://dl.acm.org/loi/trets},
}

%%% IEEE Computer Architecture Letters 19(1), January/June 2020:
%%% heterogeneous 3D integration of a RISC-V system with STT-MRAM.
%%% No abstract or keywords are recorded.
@article{Zhu:2020:HIR,
  author          = {Lingjun Zhu and Lennart Bamberg and Anthony Agnesina
                     and Francky Catthoor and Dragomir Milojevic and Manu
                     Komalan and Julien Ryckaert and Alberto Garcia-Ortiz
                     and Sung Kyu Lim},
  title           = {Heterogeneous {$3$D} Integration for a {RISC-V} System
                     With {STT-MRAM}},
  journal         = j-IEEE-COMPUT-ARCHIT-LETT,
  volume          = {19},
  number          = {1},
  pages           = {51--54},
  month           = jan # {\slash } # jun,
  year            = {2020},
  doi             = {https://doi.org/10.1109/LCA.2020.2992644},
  issn            = {1556-6056 (print), 1556-6064 (electronic)},
  issn-l          = {1556-6056},
  bibdate         = {Thu May 27 16:19:32 2021},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                     http://www.math.utah.edu/pub/tex/bib/risc-v.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {IEEE Computer Architecture Letters},
  journal-url     = {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
}

%%% ACM TACO 18(2), article 20 (March 2021): genetic-algorithm-guided
%%% program obfuscation (TSC-SPOOF), evaluated with a RISC-V rocket
%%% core on a Xilinx Zynq FPGA.
@article{Biswas:2021:CSI,
  author          = {Arnab Kumar Biswas},
  title           = {Cryptographic Software {IP} Protection without
                     Compromising Performance or Timing Side-channel
                     Leakage},
  journal         = j-TACO,
  volume          = {18},
  number          = {2},
  pages           = {20:1--20:20},
  month           = mar,
  year            = {2021},
  coden           = {????},
  doi             = {https://doi.org/10.1145/3443707},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Sat Mar 20 17:25:10 MDT 2021},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
                     http://www.math.utah.edu/pub/tex/bib/risc-v.bib;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  url             = {https://dl.acm.org/doi/10.1145/3443707},
  abstract        = {Program obfuscation is a widely used cryptographic
                     software intellectual property (IP) protection
                     technique against reverse engineering attacks in
                     embedded systems. However, very few works have studied
                     the impact of combining various obfuscation techniques
                     on the obscurity (difficulty of reverse engineering)
                     and performance (execution time) of obfuscated
                     programs. In this article, we propose a Genetic
                     Algorithm (GA)-based framework that not only optimizes
                     obscurity and performance of obfuscated cryptographic
                     programs, but it also ensures very low timing
                     side-channel leakage. Our proposed Timing Side Channel
                     Sensitive Program Obfuscation Optimization Framework
                     (TSC-SPOOF) determines the combination of obfuscation
                     transformation functions that produce optimized
                     obfuscated programs with preferred optimization
                     parameters. In particular, TSC-SPOOF employs normalized
                     compression distance (NCD) and channel capacity to
                     measure obscurity and timing side-channel leakage,
                     respectively. We also use RISC-V rocket core running on
                     a Xilinx Zynq FPGA device as part of our framework to
                     obtain realistic results. The experimental results
                     clearly show that our proposed solution leads to
                     cryptographic programs with lower execution time,
                     higher obscurity, and lower timing side-channel leakage
                     than unguided obfuscation.},
  acknowledgement = ack-nhfb,
  articleno       = {20},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {https://dl.acm.org/loi/taco},
}

%%% ACM JETC 17(2), article 24 (April 2021): multiPULPly, a memristive
%%% multiplication accelerator for the RISC-V-based PULP platform.
%%% The ISSN field lists both the print and the electronic number, in
%%% the same "(print), (electronic)" form used by the other journal
%%% entries in this file.
@Article{Eliahu:2021:MME,
  author =       "Adi Eliahu and Ronny Ronen and Pierre-Emmanuel
                 Gaillardon and Shahar Kvatinsky",
  title =        "{multiPULPly}: a Multiplication Engine for
                 Accelerating Neural Networks on Ultra-low-power
                 Architectures",
  journal =      j-JETC,
  volume =       "17",
  number =       "2",
  pages =        "24:1--24:27",
  month =        apr,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3432815",
  ISSN =         "1550-4832 (print), 1550-4840 (electronic)",
  ISSN-L =       "1550-4832",
  bibdate =      "Fri Apr 30 06:39:29 MDT 2021",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 http://www.math.utah.edu/pub/tex/bib/jetc.bib;
                 http://www.math.utah.edu/pub/tex/bib/risc-v.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3432815",
  abstract =     "Computationally intensive neural network applications
                 often need to run on resource-limited low-power
                 devices. Numerous hardware accelerators have been
                 developed to speed up the performance of neural network
                 applications and reduce power consumption; however,
                 most focus on data centers and full-fledged systems.
                 Acceleration in ultra-low-power systems has been only
                 partially addressed. In this article, we present
                 multiPULPly, an accelerator that integrates memristive
                 technologies within standard low-power CMOS technology,
                 to accelerate multiplication in neural network
                 inference on ultra-low-power systems. This accelerator
                 was designated for PULP, an open-source microcontroller
                 system that uses low-power RISC-V processors.
                 Memristors were integrated into the accelerator to
                 enable power consumption only when the memory is
                 active, to continue the task with no context-restoring
                 overhead, and to enable highly parallel analog
                 multiplication. To reduce the energy consumption, we
                 propose novel dataflows that handle common
                 multiplication scenarios and are tailored for our
                 architecture. The accelerator was tested on FPGA and
                 achieved a peak energy efficiency of 19.5 TOPS/W,
                 outperforming state-of-the-art accelerators by $ 1.5
                 \times $ to $ 4.5 \times $.",
  acknowledgement = ack-nhfb,
  articleno =    "24",
  fjournal =     "ACM Journal on Emerging Technologies in Computing
                 Systems (JETC)",
  journal-URL =  "https://dl.acm.org/loi/jetc",
}

%%% FreeBSD wiki page on running RISC-V under QEMU (8 June 2021); the
%%% note field cross-references the companion FreeBSD Journal article.
@misc{Horne:2021:RQ,
  author          = {Mitchell Horne},
  title           = {{riscv\slash QEMU}},
  howpublished    = {Web site},
  day             = {8},
  month           = jun,
  year            = {2021},
  bibdate         = {Fri Dec 23 12:03:37 2022},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/risc-v.bib},
  note            = {See also \cite{Horne:2020:GSF}.},
  url             = {https://wiki.freebsd.org/riscv/QEMU},
  acknowledgement = ack-nhfb,
}

%%% FreeBSD wiki page on Spike, the RISC-V ISA simulator (8 June 2021).
@misc{Horne:2021:S,
  author          = {Mitchell Horne},
  title           = {Spike},
  howpublished    = {Web site},
  day             = {8},
  month           = jun,
  year            = {2021},
  bibdate         = {Fri Dec 23 12:03:37 2022},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/risc-v.bib},
  url             = {https://wiki.freebsd.org/riscv/Spike},
  abstract        = {Spike is the canonical RISC-V ISA simulator. It
                     supports several ISA extensions, including some that
                     are not yet ratified. See the README on GitHub for more
                     information.},
  acknowledgement = ack-nhfb,
}

%%% IEEE Transactions on Computers 70(2), February 2021: stream
%%% semantic registers, a RISC-V ISA extension for single-issue cores.
%%% No abstract or keywords are recorded.
@article{Schuiki:2021:SSR,
  author          = {F. Schuiki and F. Zaruba and T. Hoefler and L.
                     Benini},
  title           = {Stream Semantic Registers: A Lightweight {RISC-V ISA}
                     Extension Achieving Full Compute Utilization in
                     Single-Issue Cores},
  journal         = j-IEEE-TRANS-COMPUT,
  volume          = {70},
  number          = {2},
  pages           = {212--227},
  month           = feb,
  year            = {2021},
  coden           = {ITCOB4},
  doi             = {https://doi.org/10.1109/TC.2020.2987314},
  issn            = {0018-9340 (print), 1557-9956 (electronic)},
  issn-l          = {0018-9340},
  bibdate         = {Fri Jan 29 17:51:47 2021},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/ieeetranscomput2020.bib;
                     http://www.math.utah.edu/pub/tex/bib/risc-v.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {IEEE Transactions on Computers},
  journal-url     = {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12},
}

@article{Szkandera:2021:BYO,
  author = {Szkandera, Filip},
  title = {Build Your Own {RISC-V CPU}: Even Home-Brew Processors
    Can use Hot New Tech},
  journal = j-IEEE-SPECTRUM,
  volume = {58},
  number = {6},
  pages = {16--18},
  month = jun,
  year = {2021},
  CODEN = {IEESAM},
  DOI = {https://doi.org/10.1109/MSPEC.2021.9444942},
  ISSN = {0018-9235 (print), 1939-9340 (electronic)},
  ISSN-L = {0018-9235},
  bibdate = {Fri Jun 4 12:04:57 2021},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/ieeespectrum2020.bib;
    http://www.math.utah.edu/pub/tex/bib/risc-v.bib},
  acknowledgement = ack-nhfb,
  fjournal = {IEEE Spectrum},
  journal-URL = {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=6},
}

@article{Tiwari:2021:PCP,
  author = {Tiwari, Sugandha and Gala, Neel and Rebeiro, Chester and
    Kamakoti, V.},
  title = {{PERI}: a Configurable Posit Enabled {RISC-V} Core},
  journal = j-TACO,
  volume = {18},
  number = {3},
  pages = {25:1--25:26},
  month = jun,
  year = {2021},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3446210},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  bibdate = {Tue Jun 29 08:21:11 MDT 2021},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/risc-v.bib;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL = {https://dl.acm.org/doi/10.1145/3446210},
  abstract = {Owing to the failure of Dennard's scaling, the past
    decade has seen a steep growth of prominent new
    paradigms leveraging opportunities in computer
    architecture. Two technologies of interest are Posit
    and RISC-V. Posit was introduced in mid-2017 as a
    \ldots{}},
  acknowledgement = ack-nhfb,
  articleno = {25},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {https://dl.acm.org/loi/taco},
}

@Article{Zaruba:2021:MCR,
  author =       "Florian Zaruba and Fabian Schuiki and Luca Benini",
  title =        "{Manticore}: A 4096-Core {RISC-V} Chiplet Architecture
                 for Ultraefficient Floating-Point Computing",
  journal =      j-IEEE-MICRO,
  volume =       "41",
  number =       "2",
  pages =        "36--42",
  month =        mar # "\slash " # apr,
  year =         "2021",
  CODEN =        "IEMIDZ",
  DOI =          "https://doi.org/10.1109/MM.2020.3045564",
  ISSN =         "0272-1732 (print), 1937-4143 (electronic)",
  ISSN-L =       "0272-1732",
  bibdate =      "Thu Apr 1 10:32:23 2021",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 http://www.math.utah.edu/pub/tex/bib/hot-chips.bib;
                 http://www.math.utah.edu/pub/tex/bib/ieeemicro.bib;
                 http://www.math.utah.edu/pub/tex/bib/risc-v.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Micro",
  journal-URL =  "http://www.computer.org/csdl/mags/mi/index.html",
}

@article{Adit:2022:PLT,
  author = {Adit, Neil and Sampson, Adrian},
  title = {Performance Left on the Table: An Evaluation of
    Compiler Autovectorization for {RISC-V}},
  journal = j-IEEE-MICRO,
  volume = {42},
  number = {5},
  pages = {41--48},
  month = sep # "\slash " # oct,
  year = {2022},
  CODEN = {IEMIDZ},
  DOI = {https://doi.org/10.1109/MM.2022.3184867},
  ISSN = {0272-1732 (print), 1937-4143 (electronic)},
  ISSN-L = {0272-1732},
  bibdate = {Thu Nov 03 05:37:10 2022},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/ieeemicro.bib;
    http://www.math.utah.edu/pub/tex/bib/risc-v.bib},
  acknowledgement = ack-nhfb,
  fjournal = {IEEE Micro},
  journal-URL = {http://www.computer.org/csdl/mags/mi/index.html},
}

@Article{Alder:2022:FPU,
  author =       "Fritz Alder and Jo {Van Bulck} and Jesse Spielman and
                 David Oswald and Frank Piessens",
  title =        "Faulty Point Unit: {ABI} Poisoning Attacks on Trusted
                 Execution Environments",
  journal =      j-DTRAP,
  volume =       "3",
  number =       "2",
  pages =        "13:1--13:26",
  month =        jun,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3491264",
  ISSN =         "2692-1626 (print), 2576-5337 (electronic)",
  ISSN-L =       "2576-5337",
  bibdate =      "Sat Jul 30 07:34:14 MDT 2022",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/dtrap.bib;
                 http://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 http://www.math.utah.edu/pub/tex/bib/risc-v.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3491264",
  abstract =     "This article analyzes a previously overlooked attack
                 surface that allows unprivileged adversaries to impact
                 floating-point computations in enclaves through the
                 Application Binary Interface (ABI). In a comprehensive
                 study across 7 industry-standard and research enclave
                 shielding runtimes for Intel Software Guard Extensions
                 (SGX), we show that control and state registers of the
                 x87 Floating-Point Unit (FPU) and Intel Streaming SIMD
                 Extensions are not always properly sanitized on enclave
                 entry. We furthermore show that this attack goes beyond
                 the x86 architecture and can also affect RISC-V
                 enclaves. Focusing on SGX, we abuse the adversary's
                 control over precision and rounding modes as an ABI
                 fault injection primitive to corrupt enclaved
                 floating-point operations. Our analysis reveals that
                 this is especially relevant for applications that use
                 the older x87 FPU, which is still under certain
                 conditions used by modern compilers. We exemplify the
                 potential impact of ABI quality-degradation attacks for
                 enclaved machine learning and for the SPEC benchmarks.
                 We then explore the impact on confidentiality, showing
                 that control over exception masks can be abused as a
                 controlled channel to recover enclaved multiplication
                 operands. Our findings, affecting 5 of 7 studied SGX
                 runtimes and one RISC-V runtime, demonstrate the
                 challenges of implementing high-assurance trusted
                 execution across computing architectures.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "Digital Threats: Research and Practice (DTRAP)",
  journal-URL =  "https://dl.acm.org/loi/dtrap",
}

@article{Amor:2022:RVI,
  author = {Amor, Hela Belhadj and Bernier, Carolynn and
    P{\v{r}}ikryl, Zden{\v{e}}k},
  title = {A {RISC-V ISA} Extension for Ultra-Low Power {IoT}
    Wireless Signal Processing},
  journal = j-IEEE-TRANS-COMPUT,
  volume = {71},
  number = {4},
  pages = {766--778},
  month = apr,
  year = {2022},
  CODEN = {ITCOB4},
  DOI = {https://doi.org/10.1109/TC.2021.3063027},
  ISSN = {0018-9340 (print), 1557-9956 (electronic)},
  ISSN-L = {0018-9340},
  bibdate = {Thu Mar 17 06:38:17 2022},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/ieeetranscomput2020.bib;
    http://www.math.utah.edu/pub/tex/bib/risc-v.bib},
  acknowledgement = ack-nhfb,
  fjournal = {IEEE Transactions on Computers},
  journal-URL = {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12},
}

@Article{Ditzel:2022:AMR,
  author =       "David R. Ditzel and {the Esperanto team}",
  title =        "Accelerating {ML} Recommendation With Over 1,000
                 {RISC-V\slash Tensor} Processors on {Esperanto}'s
                 {ET-SoC-1} Chip",
  journal =      j-IEEE-MICRO,
  volume =       "42",
  number =       "3",
  pages =        "31--38",
  month =        may # "\slash " # jun,
  year =         "2022",
  CODEN =        "IEMIDZ",
  DOI =          "https://doi.org/10.1109/MM.2022.3140674",
  ISSN =         "0272-1732 (print), 1937-4143 (electronic)",
  ISSN-L =       "0272-1732",
  bibdate =      "Fri May 27 06:13:54 2022",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeemicro.bib;
                 http://www.math.utah.edu/pub/tex/bib/risc-v.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Micro",
  journal-URL =  "http://www.computer.org/csdl/mags/mi/index.html",
}

@article{Feng:2022:RRV,
  author = {Feng, Lang and Huang, Jiayi and Li, Luyi and Zhang,
    Haochen and Wang, Zhongfeng},
  title = {{RvDfi}: a {RISC-V} Architecture With Security
    Enforcement by High Performance Complete Data-Flow
    Integrity},
  journal = j-IEEE-TRANS-COMPUT,
  volume = {71},
  number = {10},
  pages = {2499--2512},
  month = oct,
  year = {2022},
  CODEN = {ITCOB4},
  DOI = {https://doi.org/10.1109/TC.2021.3133701},
  ISSN = {0018-9340 (print), 1557-9956 (electronic)},
  ISSN-L = {0018-9340},
  bibdate = {Thu Sep 8 07:59:47 2022},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/ieeetranscomput2020.bib;
    http://www.math.utah.edu/pub/tex/bib/risc-v.bib},
  acknowledgement = ack-nhfb,
  fjournal = {IEEE Transactions on Computers},
  journal-URL = {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12},
}

@article{Mariotti:2022:WVB,
  author = {Mariotti, Gianfranco and Giorgi, Roberto},
  title = {\pkg{WebRISC-V}: a 32\slash 64-bit {RISC-V} pipeline
    simulation tool},
  journal = j-SOFTWAREX,
  volume = {18},
  number = {??},
  pages = {??--??},
  month = jun,
  year = {2022},
  CODEN = {????},
  DOI = {https://doi.org/10.1016/j.softx.2022.101105},
  ISSN = {2352-7110},
  ISSN-L = {2352-7110},
  bibdate = {Thu Jun 2 09:45:22 MDT 2022},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/risc-v.bib;
    http://www.math.utah.edu/pub/tex/bib/softwarex.bib},
  URL = {http://www.sciencedirect.com/science/article/pii/S235271102200070X},
  acknowledgement = ack-nhfb,
  articleno = {101105},
  fjournal = {SoftwareX},
  journal-URL = {https://www.sciencedirect.com/journal/softwarex/issues},
}

@article{Sa:2022:FLR,
  author = {S{\'a}, Bruno and Martins, Jos{\'e} and Pinto, Sandro},
  title = {A First Look at {RISC-V} Virtualization From an
    Embedded Systems Perspective},
  journal = j-IEEE-TRANS-COMPUT,
  volume = {71},
  number = {9},
  pages = {2177--2190},
  month = sep,
  year = {2022},
  CODEN = {ITCOB4},
  DOI = {https://doi.org/10.1109/TC.2021.3124320},
  ISSN = {0018-9340 (print), 1557-9956 (electronic)},
  ISSN-L = {0018-9340},
  bibdate = {Thu Aug 11 09:05:14 2022},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/ieeetranscomput2020.bib;
    http://www.math.utah.edu/pub/tex/bib/risc-v.bib;
    http://www.math.utah.edu/pub/tex/bib/virtual-machines.bib},
  acknowledgement = ack-nhfb,
  fjournal = {IEEE Transactions on Computers},
  journal-URL = {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12},
}

@article{Saarinen:2022:DRV,
  author = {Saarinen, Markku-Juhani O. and Newell, G. Richard and
    Marshall, Ben},
  title = {Development of the {RISC-V} entropy source interface},
  journal = j-J-CRYPTO-ENG,
  volume = {12},
  number = {4},
  pages = {371--386},
  month = nov,
  year = {2022},
  CODEN = {????},
  DOI = {https://doi.org/10.1007/s13389-021-00275-6},
  ISSN = {2190-8508 (print), 2190-8516 (electronic)},
  ISSN-L = {2190-8508},
  bibdate = {Fri Jun 2 12:32:09 MDT 2023},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/jcryptoeng.bib;
    http://www.math.utah.edu/pub/tex/bib/linux.bib;
    http://www.math.utah.edu/pub/tex/bib/prng.bib;
    http://www.math.utah.edu/pub/tex/bib/risc-v.bib;
    http://www.math.utah.edu/pub/tex/bib/unix.bib},
  URL = {https://link.springer.com/article/10.1007/s13389-021-00275-6},
  abstract = {The RISC-V true random number generator (TRNG)
    architecture breaks with previous ISA TRNG practice by
    splitting the entropy source (ES) component away from
    cryptographic DRBGs into a separate privileged
    interface, and in its use of polling. The modular
    approach is suitable for the RISC-V hardware IP
    ecosystem, allows a significantly smaller
    implementation footprint on platforms that need it,
    while directly supporting current standards compliance
    testing methods. We describe the interface, its use in
    cryptography, and offer additional discussion,
    background, and rationale for various aspects of it.
    The design was informed by lessons learned from earlier
    mainstream ISAs, recently introduced SP 800-90B and
    FIPS 140-3 entropy audit requirements, AIS 31 and
    common criteria, current and emerging cryptographic
    needs such as post-quantum cryptography, and the goal
    of supporting a wide variety of RISC-V implementations
    and applications. Many of the architectural choices
    result from quantitative observations about random
    number generators in secure microcontrollers, the Linux
    kernel, and cryptographic libraries.},
  acknowledgement = ack-nhfb,
  ajournal = {J. Crypto. Eng.},
  fjournal = {Journal of Cryptographic Engineering},
  journal-URL = {http://link.springer.com/journal/13389},
}

@article{Vijaykumar:2022:MPO,
  author = {Vijaykumar, Nandita and Olgun, Ataberk and
    Kanellopoulos, Konstantinos and Bostanci, F. Nisa and
    Hassan, Hasan and Lotfi, Mehrshad and
    Gibbons, Phillip B. and Mutlu, Onur},
  title = {\pkg{MetaSys}: a Practical Open-source Metadata
    Management System to Implement and Evaluate Cross-layer
    Optimizations},
  journal = j-TACO,
  volume = {19},
  number = {2},
  pages = {26:1--26:29},
  month = jun,
  year = {2022},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3505250},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  bibdate = {Fri Mar 25 07:03:00 MDT 2022},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/gnu.bib;
    http://www.math.utah.edu/pub/tex/bib/risc-v.bib;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL = {https://dl.acm.org/doi/10.1145/3505250},
  abstract = {This article introduces the first open-source
    FPGA-based infrastructure, MetaSys, with a prototype in
    a RISC-V system, to enable the rapid implementation and
    evaluation of a wide range of cross-layer techniques in
    real hardware. Hardware-software \ldots{}},
  acknowledgement = ack-nhfb,
  articleno = {26},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {https://dl.acm.org/loi/taco},
}

@article{Zhang:2022:TMT,
  author = {Zhang, Jipeng and Huang, Junhao and Liu, Zhe and
    Roy, Sujoy Sinha},
  title = {Time-Memory Trade-Offs for {Saber+} on
    Memory-Constrained {RISC-V} Platform},
  journal = j-IEEE-TRANS-COMPUT,
  volume = {71},
  number = {11},
  pages = {2996--3007},
  month = nov,
  year = {2022},
  CODEN = {ITCOB4},
  DOI = {https://doi.org/10.1109/TC.2022.3143441},
  ISSN = {0018-9340 (print), 1557-9956 (electronic)},
  ISSN-L = {0018-9340},
  bibdate = {Thu Oct 27 15:52:25 2022},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/ieeetranscomput2020.bib;
    http://www.math.utah.edu/pub/tex/bib/risc-v.bib},
  acknowledgement = ack-nhfb,
  fjournal = {IEEE Transactions on Computers},
  journal-URL = {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12},
}

@Misc{Zeeb:2022:RV,
  author =       "Bjoern Zeeb",
  title =        "{RISC-V}",
  howpublished = "Web site",
  day =          "26",
  month =        jun,
  year =         "2022",
  bibdate =      "Fri Dec 23 12:05:46 2022",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/risc-v.bib",
  URL =          "https://wiki.freebsd.org/riscv",
  abstract =     "FreeBSD/RISC-V is an architecture port for FreeBSD to
                 run on the RISC-V Instruction-Set Architecture (ISA),
                 able to boot to multi-user mode on the QEMU emulator,
                 Spike simulator and real hardware.",
  acknowledgement = ack-nhfb,
}

@article{Gomez:2023:HLV,
  author = {G{\'o}mez, Constantino and Mantovani, Filippo and
    Focht, Erich and Casas, Marc},
  title = {{HPCG} on long-vector architectures: Evaluation and
    optimization on {NEC SX-Aurora} and {RISC-V}},
  journal = j-FUT-GEN-COMP-SYS,
  volume = {143},
  number = {??},
  pages = {152--162},
  month = jun,
  year = {2023},
  CODEN = {FGSEVI},
  DOI = {https://doi.org/10.1016/j.future.2023.01.015},
  ISSN = {0167-739X (print), 1872-7115 (electronic)},
  ISSN-L = {0167-739X},
  bibdate = {Mon Mar 13 08:24:01 MDT 2023},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/futgencompsys2020.bib;
    http://www.math.utah.edu/pub/tex/bib/risc-v.bib},
  URL = {http://www.sciencedirect.com/science/article/pii/S0167739X23000225},
  abstract = {Accelerators are becoming a key component to improve
    efficiency in High-Performance Computing systems (HPC).
    While GPU based systems are widely used to accelerate
    HPC workloads, new systems based on long-vector
    architectures are rapidly gaining popularity. The
    development of optimized math libraries becomes
    fundamental to achieve high performance in those
    emerging vector architectures. This paper focuses on
    the optimization of the HPCG benchmark, which comprises
    four fundamental kernels found in many numerical
    applications. We target two relevant long-vector
    architectures like the NEC Vector Engine and the RISC-V
    `V' vector extension. Compared to the well-tuned
    proprietary solution, our open HPCG implementation
    achieves a 1.6\% improvement in performance on the NEC
    Vector Engine and achieves near maximum memory
    bandwidth utilization in the two evaluated RISC-V
    vector accelerator designs.},
  acknowledgement = ack-nhfb,
  fjournal = {Future Generation Computer Systems},
  journal-URL = {http://www.sciencedirect.com/science/journal/0167739X},
}

@article{Gruin:2023:MTP,
  author = {Gruin, Alban and Carle, Thomas and Rochange, Christine
    and Cass{\'e}, Hugues and Sainrat, Pascal},
  title = {{MINOTAuR}: A Timing Predictable {RISC-V} Core
    Featuring Speculative Execution},
  journal = j-IEEE-TRANS-COMPUT,
  volume = {72},
  number = {1},
  pages = {183--195},
  month = jan,
  year = {2023},
  CODEN = {ITCOB4},
  DOI = {https://doi.org/10.1109/TC.2022.3200000},
  ISSN = {0018-9340 (print), 1557-9956 (electronic)},
  ISSN-L = {0018-9340},
  bibdate = {Mon Dec 19 08:41:53 2022},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/ieeetranscomput2020.bib;
    http://www.math.utah.edu/pub/tex/bib/risc-v.bib},
  acknowledgement = ack-nhfb,
  fjournal = {IEEE Transactions on Computers},
  journal-URL = {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12},
}

@article{Jin:2023:SBS,
  author = {Jin, Hai and He, Zhuo and Qiang, Weizhong},
  title = {{SpecTerminator}: Blocking Speculative Side Channels
    Based on Instruction Classes on {RISC-V}},
  journal = j-TACO,
  volume = {20},
  number = {1},
  pages = {15:1--15:??},
  month = mar,
  year = {2023},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3566053},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  bibdate = {Fri Feb 17 06:54:21 MST 2023},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/risc-v.bib;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  URL = {https://dl.acm.org/doi/10.1145/3566053},
  abstract = {In modern processors, speculative execution has
    significantly improved the performance of processors,
    but it has also introduced speculative execution
    vulnerabilities. Recent defenses are based on the
    delayed execution to block various speculative side
    channels, but we show that several of the current
    state-of-the-art defenses fail to block some of the
    available speculative side channels, and the current
    most secure defense introduces a performance overhead
    of up to 24.5\%.\par

    We propose SpecTerminator, the first defense framework
    based on instruction classes that can comprehensively
    and precisely block all existing speculative side
    channels. In SpecTerminator, a novel speculative side
    channel classification scheme based on the features of
    secret transmission is proposed, and the sensitive
    instructions in the speculative window are classified
    and identified using optimized hardware taint tracking
    and instruction masking techniques to accurately
    determine the scope of leakage. Then, according to the
    execution characteristics of these instructions,
    dedicated delayed execution strategies, such as TLB
    request ignoring, selective issue, and extended
    delay-on-miss, are designed for each type of sensitive
    instruction to precisely control that these
    instructions are delayed only in pipeline stages that
    are at risk of leakage. In contrast to previous
    defenses based on the Gem5 simulator, we have
    innovatively implemented defenses against Spectre
    attacks based on the open-source instruction set RISC-V
    on an FPGA-accelerated simulation platform that is more
    similar to real hardware. To evaluate the security of
    SpecTerminator, we have replicated various existing
    x86-based Spectre variants on RISC-V. On SPEC 2006,
    SpecTerminator defends against Spectre attacks based on
    memory hierarchy side channels with a performance
    overhead of 2.6\% and against all existing Spectre
    attacks with a performance overhead of 6.0\%.},
  acknowledgement = ack-nhfb,
  articleno = {15},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {https://dl.acm.org/loi/taco},
}

@article{Kuo:2023:RVG,
  author = {Kuo, Yao-Ming and Garc{\'\i}a-Herrero, Francisco and
    Ruano, Oscar and Maestro, Juan Antonio},
  title = {{RISC-V} {Galois Field} {ISA} Extension for Non-Binary
    Error-Correction Codes and Classical and Post-Quantum
    Cryptography},
  journal = j-IEEE-TRANS-COMPUT,
  volume = {72},
  number = {3},
  pages = {682--692},
  month = mar,
  year = {2023},
  CODEN = {ITCOB4},
  DOI = {https://doi.org/10.1109/TC.2022.3174587},
  ISSN = {0018-9340 (print), 1557-9956 (electronic)},
  ISSN-L = {0018-9340},
  bibdate = {Sat Feb 18 16:18:34 2023},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
    http://www.math.utah.edu/pub/tex/bib/ieeetranscomput2020.bib;
    http://www.math.utah.edu/pub/tex/bib/risc-v.bib},
  acknowledgement = ack-nhfb,
  fjournal = {IEEE Transactions on Computers},
  journal-URL = {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12},
}

@article{Snelgrove:2023:SPT,
  author = {Snelgrove, Martin and Beachler, Robert},
  title = {{speedAI240}: a 2-Petaflop, {30-Teraflops\slash W}
    At-Memory Inference Acceleration Device With 1456
    {RISC-V} Cores},
  journal = j-IEEE-MICRO,
  volume = {43},
  number = {3},
  pages = {58--63},
  month = may # "\slash " # jun,
  year = {2023},
  CODEN = {IEMIDZ},
  DOI = {https://doi.org/10.1109/MM.2023.3255864},
  ISSN = {0272-1732 (print), 1937-4143 (electronic)},
  ISSN-L = {0272-1732},
  bibdate = {Thu May 18 07:38:12 2023},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/hot-chips.bib;
    http://www.math.utah.edu/pub/tex/bib/ieeemicro.bib;
    http://www.math.utah.edu/pub/tex/bib/risc-v.bib},
  acknowledgement = ack-nhfb,
  fjournal = {IEEE Micro},
  journal-URL = {https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=40},
}

@article{Talpes:2023:MDT,
  author = {Talpes, Emil and Sarma, Debjit Das and Williams, Doug and
    Arora, Sahil and Kunjan, Thomas and Floering, Benjamin and
    Jalote, Ankit and Hsiong, Christopher and
    Poorna, Chandrasekhar and Samant, Vaidehi and Sicilia, John and
    Nivarti, Anantha Kumar and Ramachandran, Raghuvir and
    Fischer, Tim and Herzberg, Ben and McGee, Bill and
    Venkataramanan, Ganesh and Banon, Pete},
  title = {The Microarchitecture of {DOJO}, {Tesla}'s Exa-Scale
    Computer},
  journal = j-IEEE-MICRO,
  volume = {43},
  number = {3},
  pages = {31--39},
  month = may # "\slash " # jun,
  year = {2023},
  CODEN = {IEMIDZ},
  DOI = {https://doi.org/10.1109/MM.2023.3258906},
  ISSN = {0272-1732 (print), 1937-4143 (electronic)},
  ISSN-L = {0272-1732},
  bibdate = {Thu May 18 07:38:12 2023},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/fparith.bib;
    http://www.math.utah.edu/pub/tex/bib/hot-chips.bib;
    http://www.math.utah.edu/pub/tex/bib/ieeemicro.bib;
    http://www.math.utah.edu/pub/tex/bib/risc-v.bib},
  acknowledgement = ack-nhfb,
  fjournal = {IEEE Micro},
  journal-URL = {https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=40},
  remark = {DOJO is based on RISC-V64 with instruction set
    extensions. Its arithmetic supports 8-, 16-, 32-, and
    64-bit integers, and IEEE 754 FP32 (1/8/23), plus FP16
    (1/5/10), BFP16 (1/8/7), CFP8 (1/4/3), CFP8 (1/5/2),
    and CFP16 (1/5/10) floating-point formats. The latter
    is unusual having an external register that records the
    exponent bias (0, 31, or 63), so that it supports three
    different ranges of numbers. There is no support for
    FP64 or longer formats. There is support for stochastic
    rounding.},
}

@article{Wen:2023:WCP,
  author = {Wen, Elliott and Weber, Gerald and Nanayakkara, Suranga},
  title = {{WasmAndroid}: a Cross-Platform Runtime for Native
    Programming Languages on {Android}},
  journal = j-TECS,
  volume = {22},
  number = {1},
  pages = {4:1--4:??},
  month = jan,
  year = {2023},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3530286},
  ISSN = {1539-9087 (print), 1558-3465 (electronic)},
  ISSN-L = {1539-9087},
  bibdate = {Sat Mar 11 08:39:25 MST 2023},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/risc-v.bib;
    http://www.math.utah.edu/pub/tex/bib/tecs.bib},
  URL = {https://dl.acm.org/doi/10.1145/3530286},
  abstract = {Open source hardware such as RISC-V has been gaining
    substantial momentum. Recently, they have begun to
    embrace Google's Android operating system to leverage
    its software ecosystem. Despite the encouraging
    progress, a challenging issue arises: a majority
    \ldots{}},
  acknowledgement = ack-nhfb,
  articleno = {4},
  fjournal = {ACM Transactions on Embedded Computing Systems},
  journal-URL = {https://dl.acm.org/loi/tecs},
}

@article{Yang:2023:ATF,
  author = {Yang, Chun-Chieh and Chen, Yi-Ru and Liao, Hui-Hsin and
    Chang, Yuan-Ming and Lee, Jenq-Kuen},
  title = {Auto-tuning Fixed-point Precision with {TVM} on
    {RISC-V} Packed {SIMD} Extension},
  journal = j-TODAES,
  volume = {28},
  number = {3},
  pages = {33:1--33:??},
  month = may,
  year = {2023},
  CODEN = {ATASFO},
  DOI = {https://doi.org/10.1145/3569939},
  ISSN = {1084-4309 (print), 1557-7309 (electronic)},
  ISSN-L = {1084-4309},
  bibdate = {Wed May 17 08:06:20 MDT 2023},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/fparith.bib;
    http://www.math.utah.edu/pub/tex/bib/risc-v.bib;
    http://www.math.utah.edu/pub/tex/bib/todaes.bib},
  URL = {https://dl.acm.org/doi/10.1145/3569939},
  abstract = {Today, as deep learning (DL) is applied more often in
    daily life, dedicated processors such as CPUs and GPUs
    have become very important for accelerating model
    executions. With the growth of technology, people are
    becoming accustomed to using edge devices, such as
    mobile phones, smart watches, and VR devices in their
    daily lives. A variety of technologies using DL are
    gradually being applied to these edge devices. However,
    there is a large number of computations in DL. It faces
    a challenging problem how to provide solutions in the
    edge devices. In this article, the proposed method
    enables a flow with the RISC-V Packed extension (P
    extension) in TVM. TVM, an open deep learning compiler
    for neural network models, is growing as a key
    infrastructure for DL computing. RISC-V is an open
    instruction set architecture (ISA) with customized and
    flexible features. The Packed-SIMD extension is a
    RISC-V extension that enables subword
    single-instruction multiple-data (SIMD) computations in
    RISC-V architectures to support fallback engines in AI
    computing. In the proposed flow, a fixed-point type
    that is supported by an integer of 16-bit type and
    saturation instructions is added to replace the
    original 32-bit float type. In addition, an auto-tuning
    method is proposed to use a uniform selector mechanism
    (USM) to find the binary point position for fixed-point
    type use. The tensorization feature of TVM can be used
    to optimize specific hardware such as subword SIMD
    instructions with RISC-V P extension. With our
    experiment on the Spike simulator, the proposed method
    with the USM can improve performance by approximately
    2.54 to 6.15$ \times $ in terms of instruction counts
    with little accuracy loss.},
  acknowledgement = ack-nhfb,
  articleno = {33},
  fjournal = {ACM Transactions on Design Automation of Electronic
    Systems},
  journal-URL = {https://dl.acm.org/loi/todaes},
}

%%% ====================================================================
%%% Cross-referenced entries must come last:
%%% Proceedings volume of ARITH 24 (London, 24--26 July 2017).  It is
%%% listed in the trailing section reserved for cross-referenced entries,
%%% so it must stay after any entry that names it in a crossref field.
%%% The month is given with the standard three-letter macro, taken from
%%% the meeting dates stated in the title.
@Proceedings{Burgess:2017:ISC,
  editor =       "Neil Burgess and Javier Bruguera and Florent de
                 Dinechin",
  booktitle =    "{24th IEEE Symposium on Computer Arithmetic (ARITH
                 24), London, UK, 24--26 July 2017}",
  title =        "{2017 IEEE 24th Symposium on Computer Arithmetic
                 (ARITH 24), London, UK, 24--26 July 2017}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xii + 198",
  month =        jul,
  year =         "2017",
  ISBN =         "1-5386-1966-0 (print), 1-5386-1965-2, 1-5386-1964-4",
  ISBN-13 =      "978-1-5386-1966-7 (print), 978-1-5386-1965-0,
                 978-1-5386-1964-3",
  ISSN =         "1063-6889",
  LCCN =         "QA76.9.C62 S95 2017",
  bibdate =      "Fri Nov 17 10:14:11 2017",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/h/higham-nicholas-john.bib;
                 http://www.math.utah.edu/pub/tex/bib/elefunt.bib;
                 http://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 http://www.math.utah.edu/pub/tex/bib/gnu.bib;
                 http://www.math.utah.edu/pub/tex/bib/risc-v.bib",
  URL =          "http://ieeexplore.ieee.org/servlet/opac?punumber=8019911",
  acknowledgement = ack-nhfb,
  keywords =     "ARITH-24; computer arithmetic units; correctness
                 proofs; cryptography; domain specific designs; error
                 analysis; exascale computing; floating point
                 arithmetic; floating-point error analysis; formal
                 verification; function approximation; modular
                 arithmetic; theorem proving; verification",
}

%%% Proceedings volume of the Conference for Next Generation Arithmetic
%%% (Singapore, March 2019).  It is listed in the trailing section
%%% reserved for cross-referenced entries.  The month uses the standard
%%% macro, taken from the date in the title, and the subject list holds
%%% each catalogue heading once.  LCCN "????" marks a value that is not
%%% yet known.
@Proceedings{Gustafson:2019:PCN,
  editor =       "John Gustafson and Vassil Dimitrov",
  booktitle =    "{Proceedings of the Conference for Next Generation
                 Arithmetic 2019, Singapore, March 2019}",
  title =        "{Proceedings of the Conference for Next Generation
                 Arithmetic 2019, Singapore, March 2019}",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "66",
  month =        mar,
  year =         "2019",
  ISBN =         "1-4503-7139-6",
  ISBN-13 =      "978-1-4503-7139-1",
  LCCN =         "????",
  bibdate =      "Mon Feb 10 12:06:51 MST 2020",
  bibsource =    "fsz3950.oclc.org:210/WorldCat;
                 http://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 http://www.math.utah.edu/pub/tex/bib/risc-v.bib",
  series =       "ICPS",
  acknowledgement = ack-nhfb,
  meetingname =  "Conference for Next Generation Arithmetic (2019:
                 Singapore)",
  subject =      "Computer arithmetic; Congresses; Computer algorithms",
}

%%% Proceedings volume of ARITH-26 (Kyoto, 10--12 June 2019).  It is
%%% listed in the trailing section reserved for cross-referenced entries.
%%% Values are brace-delimited here; BibTeX reads them the same way as
%%% the quoted form, and runs of white space inside a value collapse to
%%% a single blank.
@Proceedings{Takagi:2019:ISC,
  editor =       {Naofumi Takagi and Sylvie Boldo and Martin Langhammer},
  title =        {{2019 IEEE 26th Symposium on Computer Arithmetic ARITH-26
                 (2019), Kyoto, Japan, 10--12 June 2019}},
  booktitle =    {{2019 IEEE 26th Symposium on Computer Arithmetic ARITH-26
                 (2019), Kyoto, Japan, 10--12 June 2019}},
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  month =        jun,
  year =         {2019},
  pages =        {15 + 220},
  DOI =          {https://doi.org/10.1109/ARITH.2019.00001},
  ISBN =         {1-72813-366-1},
  ISBN-13 =      {978-1-72813-366-9},
  ISSN =         {1063-6889},
  ISSN-L =       {1063-6889},
  bibdate =      {Fri Jan 31 08:18:07 2020},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/cryptography2000.bib;
                 http://www.math.utah.edu/pub/tex/bib/elefunt.bib;
                 http://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 http://www.math.utah.edu/pub/tex/bib/risc-v.bib},
  keywords =     {ARITH-26},
  abstract =     {Presents the title page of the proceedings record.},
  acknowledgement = ack-nhfb,
}